ai-parrot 0.3.4__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic.
- ai_parrot-0.3.4.dist-info/LICENSE +21 -0
- ai_parrot-0.3.4.dist-info/METADATA +319 -0
- ai_parrot-0.3.4.dist-info/RECORD +109 -0
- ai_parrot-0.3.4.dist-info/WHEEL +6 -0
- ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
- parrot/__init__.py +21 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +728 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +366 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +83 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/odoo.py +17 -0
- parrot/chatbots/retrievals/__init__.py +578 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +110 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +162 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +137 -0
- parrot/llms/abstract.py +47 -0
- parrot/llms/anthropic.py +42 -0
- parrot/llms/google.py +42 -0
- parrot/llms/groq.py +45 -0
- parrot/llms/hf.py +45 -0
- parrot/llms/openai.py +59 -0
- parrot/llms/pipes.py +114 -0
- parrot/llms/vertex.py +78 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/audio.py +106 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +437 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +120 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +166 -0
- parrot/models.py +372 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +48 -0
- parrot/stores/abstract.py +171 -0
- parrot/stores/milvus.py +632 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +12 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/pdfchapters.py
ADDED

@@ -0,0 +1,142 @@
+from collections.abc import Callable
+from typing import Any, Optional, List, Union
+from pathlib import PurePath, Path
+from io import StringIO
+import fitz  # PyMuPDF
+from langchain.docstore.document import Document
+from langchain.text_splitter import (
+    RecursiveCharacterTextSplitter
+)
+from .basepdf import BasePDF
+
+
+class PDFChapterLoader(BasePDF):
+    """
+    Preserving Chapter Structure from PDF files.
+    """
+    def __init__(
+        self,
+        path: PurePath,
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Callable[..., Any] = None,
+        source_type: str = 'pdf',
+        language: str = "eng",
+        **kwargs
+    ):
+        super().__init__(
+            path=path,
+            tokenizer=tokenizer,
+            text_splitter=text_splitter,
+            source_type=source_type,
+            language=language,
+            **kwargs
+        )
+        # Which Font is used for titles (Chapter separation)
+        self.title_font: list = kwargs.get('title_font', 'Calibri-Bold')
+        if not text_splitter:
+            self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+                self.tokenizer,
+                chunk_size=2000,
+                chunk_overlap=100,
+                add_start_index=True,  # If `True`, includes chunk's start index in metadata
+                strip_whitespace=True,  # strips whitespace from the start and end
+                separators=["\n\n", "\n", "\r\n", "\r", "\f", "\v", "\x0b", "\x0c"],
+            )
+
+    def eval_title(self, title_font: str) -> bool:
+        """
+        Check if the font is a title font.
+
+        Args:
+            title_font (str): The font to check.
+
+        Returns:
+            bool: True if the font is a title font.
+        """
+        return 'Bold' in title_font or title_font == self.title_font
+
+    def _load_pdf(self, path: PurePath, **kwargs):
+        """
+        Open a PDF file.
+
+        Args:
+            path (PurePath): The path to the PDF file.
+
+        Returns:
+            pdfplumber.PDF: The PDF object.
+        """
+        pdf = fitz.open(path)
+        self.logger.info(f"Loading PDF file: {path}")
+        chapters = []
+        current_chapter_text = ''
+        current_chapter_title = ''
+        current_chapter_page = None
+        chapter_titles = set()  # Keep track of unique chapter titles
+        for page_num in range(len(pdf)):
+            page = pdf.load_page(page_num)
+            blocks = page.get_text("dict")["blocks"]
+            page_number = page_num + 1
+            metadata = {
+                "url": '',
+                "index": f"{path.name} #{page_number}",
+                "source": f"{path.name} #{page_number}",
+                "filename": path.name,
+                "source_type": self._source_type,
+                "type": "pdf",
+                "question": "",
+                "answer": "",
+                "summary": '',
+                "document_meta": {
+                    "page_number": page_num,
+                    # **pdf.metadata
+                }
+            }
+            for b in blocks:
+                if b['type'] == 0:  # Text block
+                    block_text = ''
+                    for line in b["lines"]:
+                        for span in line["spans"]:
+                            block_text += span['text']  # Accumulate text within the block
+
+                    # Check if the block text is a title by examining the font
+                    if any(self.eval_title(span['font']) for line in b["lines"] for span in line["spans"]):
+                        title = block_text.strip()
+                        if title not in chapter_titles:
+                            # Save the current chapter if it's not empty and start a new one
+                            if current_chapter_text.strip() and current_chapter_text.strip() != current_chapter_title.strip():
+                                chapters.append({
+                                    'chapter': current_chapter_title,
+                                    'content': current_chapter_text.strip(),
+                                    'page': current_chapter_page,
+                                    'meta': metadata
+                                })
+                            current_chapter_title = f"**{title}**: "
+                            current_chapter_page = page_num + 1
+                            current_chapter_text = current_chapter_title
+                            chapter_titles.add(title)
+                        else:
+                            # Continue appending to the existing chapter
+                            current_chapter_text += block_text
+                    else:
+                        # Continue appending text to the current chapter
+                        current_chapter_text += block_text
+
+                    # Add a newline after processing each block, if not a chapter title
+                    if not block_text.strip().startswith(current_chapter_title):
+                        current_chapter_text += "\n"
+
+        # Save the last chapter if it exists and it's not just the title
+        if current_chapter_text.strip() and current_chapter_text.strip() != current_chapter_title.strip():
+            chapters.append({
+                'chapter': current_chapter_title,
+                'content': current_chapter_text.strip(),
+                'page': current_chapter_page,
+                'meta': metadata
+            })
+        documents = []
+        for chapter in chapters:
+            documents.append(Document(
+                page_content=chapter['content'],
+                metadata=chapter['meta']
+            ))
+        return documents
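For orientation, a minimal usage sketch of the chapter loader (the input file and title_font value are hypothetical, and load() is assumed to be the BasePDF entry point, as the load() override in pdfimages.py below suggests):

from pathlib import Path
from parrot.loaders.pdfchapters import PDFChapterLoader

# Hypothetical document whose chapter headings use a bold font.
loader = PDFChapterLoader(
    path=Path("manual.pdf"),
    title_font="Calibri-Bold",
)
for doc in loader.load():  # one langchain Document per detected chapter
    print(doc.metadata["index"], len(doc.page_content))

Since eval_title also accepts any font whose name contains "Bold", title_font only matters for documents that mark headings some other way.
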
parrot/loaders/pdffn.py
ADDED
@@ -0,0 +1,112 @@
+from collections.abc import Callable
+from typing import Any, Optional, List, Union
+from pathlib import PurePath, Path
+from io import StringIO
+import fitz  # PyMuPDF
+from langchain.docstore.document import Document
+from langchain.text_splitter import (
+    RecursiveCharacterTextSplitter
+)
+from .basepdf import BasePDF
+
+
+class PDFFnLoader(BasePDF):
+    """
+    Loading a PDF with including function processing.
+    """
+    def __init__(
+        self,
+        path: PurePath,
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Union[None, Callable[..., Any]] = None,
+        source_type: str = 'pdf',
+        language: str = "eng",
+        **kwargs
+    ):
+        table_settings = kwargs.pop('table_settings', {})
+        super().__init__(
+            path=path,
+            tokenizer=tokenizer,
+            text_splitter=text_splitter,
+            source_type=source_type,
+            language=language,
+            **kwargs
+        )
+        if not text_splitter:
+            self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+                self.tokenizer,
+                chunk_size=2000,
+                chunk_overlap=100,
+                add_start_index=True,  # If `True`, includes chunk's start index in metadata
+                strip_whitespace=True,  # strips whitespace from the start and end
+                separators=["\n\n", "\n", "\r\n", "\r", "\f", "\v", "\x0b", "\x0c"],
+            )
+        self.table_settings = {
+            "vertical_strategy": "lines",
+            "horizontal_strategy": "text",
+            "intersection_x_tolerance": 5,
+            "intersection_y_tolerance": 5,
+            "edge_min_length": 10,
+        }
+        # Define settings for Fitz Table Processing
+        self.table_settings = {**self.table_settings, **table_settings}
+
+    def set_metadata(self, path, page, page_number, **kwargs) -> dict:
+        n = page_number + 1
+        return {
+            "url": '',
+            "index": f"{path.name} #{page_number}",
+            "source": f"{path.name} #{page_number}",
+            "filename": path.name,
+            "source_type": self._source_type,
+            "type": "pdf",
+            "question": "",
+            "answer": "",
+            "summary": '',
+            "document_meta": {
+                "page_number": n,
+                **kwargs
+            }
+        }
+
+    def processing_table(self, table, table_idx, page, **kwargs) -> dict:
+        df = table.to_pandas()  # convert to pandas DataFrame
+        df = df.dropna(axis=1, how='all')
+        df = df.dropna(how='all', axis=0)  # Drop empty rows
+        table_data = []
+        # Extract text from each cell
+        for row_idx in range(table.row_count):
+            for col_idx in range(table.column_count):
+                cell = table[row_idx][col_idx]
+                print('CELL ', cell)
+                print('---------')
+                cell_text = cell.get_text("text", flags=fitz.TEXTFLAGS_HTML)
+                print(cell_text)
+
+        return table_data
+
+    def _load_pdf(self, path: PurePath, **kwargs):
+        """
+        Open a PDF file.
+
+        Args:
+            path (PurePath): The path to the PDF file.
+
+        Returns:
+            fitz.PDF: The PDF object.
+        """
+        pdf = fitz.open(path)
+        self.logger.info(f"Loading PDF file: {path}")
+        for page_num in range(len(pdf)):
+            # Will extract first the table and second the block of texts
+            page = pdf.load_page(page_num)
+            parts = page.get_text("dict", flags=fitz.TEXTFLAGS_HTML)
+            # print('PARTS ', parts)
+            blocks = page.get_text("dict")["blocks"]
+            # print('BLOCKS >', blocks)
+            metadata = self.set_metadata(path, page, page_num)
+            # print('META > ', metadata)
+            tables = page.find_tables(**self.table_settings)
+            for tab_idx, table in enumerate(tables):
+                table_data = self.processing_table(table, tab_idx, page)
+        return []
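As released, PDFFnLoader looks unfinished: processing_table still carries debug prints, _load_pdf discards its results and always returns an empty list, and the cell loop indexes the table as table[row_idx][col_idx], which does not match the documented PyMuPDF Table API (cell text comes from Table.extract() or Table.to_pandas()). A minimal standalone sketch of cell-by-cell extraction under that documented API (the file name is hypothetical):

import fitz  # PyMuPDF

doc = fitz.open("report.pdf")  # hypothetical input file
page = doc[0]
for table in page.find_tables().tables:
    rows = table.extract()  # list of rows, each a list of cell strings
    for row_idx, row in enumerate(rows):
        for col_idx, cell_text in enumerate(row):
            print(row_idx, col_idx, repr(cell_text))
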
parrot/loaders/pdfimages.py
ADDED

@@ -0,0 +1,207 @@
+from collections.abc import Callable
+from pathlib import Path, PurePath
+from typing import Any
+import fitz
+from pdf4llm import to_markdown
+from PIL import Image
+from langchain.docstore.document import Document
+from langchain.text_splitter import MarkdownTextSplitter
+from transformers import (
+    AutoTokenizer,
+    AutoProcessor,
+    LlavaForConditionalGeneration,
+    pipeline,
+    BitsAndBytesConfig
+)
+import torch
+from .basepdf import BasePDF
+
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16
+)
+
+
+class PDFImageLoader(BasePDF):
+    """
+    Loader for PDF files.
+    """
+    default_prompt: str = "<|user|>\n<image>\nExplain this schematic diagram or technical installation instructions and wire diagrams, please be detailed about descriptions of steps:<|end|>\n<|assistant|>\n"
+    def __init__(
+        self,
+        path: PurePath,
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Callable[..., Any] = None,
+        source_type: str = 'pdf',
+        language: str = "eng",
+        **kwargs
+    ):
+        super().__init__(
+            path=path,
+            tokenizer=tokenizer,
+            text_splitter=text_splitter,
+            source_type=source_type,
+            language=language,
+            **kwargs
+        )
+        self._image_model = kwargs.get('image_model', 'llava-hf/llava-v1.6-vicuna-7b-hf')
+        self._task = kwargs.get('task', 'image-to-text')
+        self._max_tokens = kwargs.get('max_tokens', 600)
+        # Loading the model with low CPU memory usage
+        # model = LlavaForConditionalGeneration.from_pretrained(
+        #     self._image_model,
+        #     quantization_config=quantization_config,
+        #     device_map="auto",
+        #     torch_dtype=torch.float16,
+        #     low_cpu_mem_usage=True
+        # )
+        # # Load the processor
+        # processor = AutoProcessor.from_pretrained(self._image_model, use_fast=True)
+        self._pipeline = pipeline(
+            self._task,
+            model=self._image_model,
+            # tokenizer=processor.tokenizer,
+            # image_processor=processor.image_processor,
+            model_kwargs={"quantization_config": quantization_config},
+            # device=self._device,
+            max_new_tokens=self._max_tokens,
+            # low_cpu_mem_usage=True,
+            use_fast=True
+        )
+        # default prompt
+        self._prompt = kwargs.get('prompt', self.default_prompt)
+        # Markdown Splitter
+        self._splitter = MarkdownTextSplitter(
+            chunk_size = self._chunk_size,
+            chunk_overlap=10
+        )
+
+    def pixmap_to_pil_image(self, pix):
+        """Converts a PyMuPDF Pixmap object to a PIL Image"""
+        return Image.frombytes(
+            "RGB",
+            [pix.width, pix.height],
+            pix.samples
+        )
+
+    def _load_pdf(self, path: Path) -> list:
+        """
+        Load a PDF file as Images.
+
+        Args:
+            path (Path): The path to the PDF file.
+
+        Returns:
+            list: A list of Langchain Documents.
+        """
+        if self._check_path(path):
+            self.logger.info(f"Loading PDF file: {path}")
+            pdf = fitz.open(str(path))  # Open the PDF file
+            docs = []
+            try:
+                # get markdown for all pages and saved separately
+                md_text = to_markdown(pdf)
+                try:
+                    summary = self.get_summary_from_text(md_text)
+                except Exception:
+                    summary = ''
+                metadata = {
+                    "url": '',
+                    "idx": str(path.name),
+                    "filename": str(path.name),
+                    "source": str(path.name),
+                    "type": 'pdf',
+                    "question": '',
+                    "answer": '',
+                    "data": {},
+                    "summary": summary,
+                    "source_type": self._source_type,
+                    "document_meta": {
+                        "title": pdf.metadata.get("title", ""),
+                        "author": pdf.metadata.get("author", ""),
+                    }
+                }
+                for idx, chunk in enumerate(self._splitter.split_text(md_text)):
+                    _info = {
+                        "index": f"{idx}",
+                        **metadata
+                    }
+                    docs.append(
+                        Document(
+                            page_content=chunk,
+                            metadata=_info
+                        )
+                    )
+            except (IndexError, ValueError) as exc:
+                self.logger.warning(
+                    f"There is no text data to load on {path.name}: {exc}"
+                )
+            # Then, processing the pages one by one as Images:
+            file_name = path.stem.replace(" ", "_").replace(".", "_")
+            for page_number in range(pdf.page_count):
+                page_num = page_number + 1
+                self.logger.notice(
+                    f"Processing PDF {path} on Page {page_num}"
+                )
+                page = pdf[page_number]
+                pix = page.get_pixmap(colorspace=fitz.csRGB, alpha=False)
+                zoom_x = 2.0  # horizontal zoom
+                zoom_y = 2.0  # vertical zoom
+                mat = fitz.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension
+                img_stream = self.pixmap_to_pil_image(pix)
+                url = ''
+                img_name = f'image_{file_name}_{page_num}.png'
+                if self.save_images is True:
+                    img_path = self.save_image(
+                        img_stream,
+                        img_name,
+                        self._imgdir
+                    )
+                    url = f'/static/images/{img_name}'
+                # extracting features and explanations:
+                outputs = self._pipeline(
+                    img_stream,
+                    prompt=self._prompt,
+                    generate_kwargs={"max_new_tokens": self._max_tokens}
+                )
+                documents = []
+                for idx, output in enumerate(outputs):
+                    generated_text = output['generated_text']
+                    # Split using the special tokens, if available
+                    split_text = generated_text.split("<|assistant|>")
+                    prompt_text = split_text[0].replace("<|prompt|>", "").strip() if "<|prompt|>" in generated_text else ""
+                    response_text = split_text[1].strip() if len(split_text) > 1 else ""
+                    # Attach the image using Markdown syntax
+                    image_markdown = f"\n\n\n"
+                    response_text += image_markdown
+                    _meta = {
+                        "url": f"{url}",
+                        "filename": str(path.name),
+                        "index": f"Page {page_num}, part: {idx}",
+                        "source": str(path.name),
+                        "type": 'pdf',
+                        "question": prompt_text,
+                        "answer": '',
+                        "data": {},
+                        "summary": '',
+                        "source_type": self._source_type,
+                        "document_meta": {
+                            "page": f"Page {page}",
+                            "image": f"{img_name}",
+                            "url": f"{url}"
+                        }
+                    }
+                    documents.append(
+                        Document(
+                            page_content=response_text,
+                            metadata=_meta
+                        )
+                    )
+            return docs + documents
+
+    def load(self) -> list:
+        try:
+            return super().load()
+        finally:
+            self._pipeline = None
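The image-description step leans entirely on the transformers pipeline API: a PIL image plus a chat-style prompt goes in, generated text comes out. A minimal standalone sketch of the same call pattern (model name, task, and token limit copied from the loader's defaults; the page image and shortened prompt are hypothetical, and the 4-bit quantization settings are omitted for brevity):

from PIL import Image
from transformers import pipeline

# Same task, model, and token limit the loader defaults to above.
pipe = pipeline(
    "image-to-text",
    model="llava-hf/llava-v1.6-vicuna-7b-hf",
    max_new_tokens=600,
)
image = Image.open("page_1.png")  # hypothetical page rendering
outputs = pipe(
    image,
    prompt="<|user|>\n<image>\nDescribe this diagram:<|end|>\n<|assistant|>\n",
)
print(outputs[0]["generated_text"])

One design detail worth noting: the loader builds a fitz.Matrix(2.0, 2.0) zoom factor but never passes it to get_pixmap, so pages are rasterized at their default resolution.
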
parrot/loaders/pdfmark.py
ADDED

@@ -0,0 +1,88 @@
+from typing import Any
+from collections.abc import Callable
+from pathlib import Path, PurePath
+import fitz
+from pdf4llm import to_markdown
+from langchain.docstore.document import Document
+from langchain.text_splitter import MarkdownTextSplitter
+from .basepdf import BasePDF
+
+class PDFMarkdownLoader(BasePDF):
+    """
+    Loader for PDF files converted content to markdown.
+    """
+
+    def __init__(
+        self,
+        path: PurePath,
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Callable[..., Any] = None,
+        source_type: str = 'pdf',
+        language: str = "eng",
+        **kwargs
+    ):
+        super().__init__(
+            path=path,
+            tokenizer=tokenizer,
+            text_splitter=text_splitter,
+            source_type=source_type,
+            language=language,
+            **kwargs
+        )
+        self._splitter = MarkdownTextSplitter(chunk_size = 1024, chunk_overlap=10)
+
+    def _load_pdf(self, path: Path) -> list:
+        """
+        Load a PDF file using the PDFMiner library.
+
+        Args:
+            path (Path): The path to the PDF file.
+
+        Returns:
+            list: A list of Langchain Documents.
+        """
+        if self._check_path(path):
+            self.logger.info(f"Loading PDF file: {path}")
+            docs = []
+            pdf = fitz.open(str(path))
+            md_text = to_markdown(pdf)  # get markdown for all pages
+            try:
+                summary = self.get_summary_from_text(md_text)
+            except Exception:
+                summary = ''
+            metadata = {
+                "url": '',
+                "filename": path.name,
+                # "index": f"{path.name}",
+                "source": str(path.name),
+                "type": 'pdf',
+                "question": '',
+                "answer": '',
+                "data": {},
+                "summary": summary,
+                "source_type": self._source_type,
+                "document_meta": {
+                    "title": pdf.metadata.get("title", ""),
+                    # "subject": pdf.metadata.get("subject", ""),
+                    # "keywords": pdf.metadata.get("keywords", ""),
+                    "creationDate": pdf.metadata.get("creationDate", ""),
+                    # "modDate": pdf.metadata.get("modDate", ""),
+                    # "producer": pdf.metadata.get("producer", ""),
+                    # "creator": pdf.metadata.get("creator", ""),
+                    "author": pdf.metadata.get("author", ""),
+                }
+            }
+            for idx, chunk in enumerate(self._splitter.split_text(md_text)):
+                _info = {
+                    "index": f"{idx}",
+                    **metadata
+                }
+                docs.append(
+                    Document(
+                        page_content=chunk,
+                        metadata=_info
+                    )
+                )
+            return docs
+        else:
+            return []
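The conversion path here is simply pdf4llm's to_markdown over the open document followed by langchain's MarkdownTextSplitter, mirrored in this standalone sketch (the file name is hypothetical):

import fitz
from pdf4llm import to_markdown
from langchain.text_splitter import MarkdownTextSplitter

pdf = fitz.open("guide.pdf")  # hypothetical input file
md_text = to_markdown(pdf)  # Markdown for every page, as one string
splitter = MarkdownTextSplitter(chunk_size=1024, chunk_overlap=10)
for idx, chunk in enumerate(splitter.split_text(md_text)):
    print(f"chunk {idx}: {len(chunk)} chars")
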
parrot/loaders/pdftables.py
ADDED

@@ -0,0 +1,145 @@
+from collections.abc import Callable
+from typing import Any, Optional, List
+from pathlib import Path, PurePath
+from io import StringIO
+import fitz
+import pandas as pd
+from langchain.docstore.document import Document
+from .basepdf import BasePDF
+
+
+class PDFTablesLoader(BasePDF):
+    """
+    Loader for Tables in PDF Files.
+    """
+    _extension = ['.pdf']
+
+    def __init__(
+        self,
+        path: PurePath,
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Callable[..., Any] = None,
+        source_type: str = 'pdf',
+        language: str = "eng",
+        table_settings: dict = {},
+        **kwargs
+    ):
+        super().__init__(
+            path,
+            tokenizer,
+            text_splitter,
+            source_type,
+            language=language,
+            **kwargs
+        )
+        # Table Settings:
+        self.table_settings = {
+            # "vertical_strategy": "text",
+            # "horizontal_strategy": "text",
+            "intersection_x_tolerance": 5,
+            "intersection_y_tolerance": 5
+        }
+        if table_settings:
+            self.table_settings.update(table_settings)
+        self._skiprows = kwargs.pop('skiprows', None)
+
+    def unique_columns(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Rename duplicate columns in the DataFrame to ensure they are unique.
+
+        Args:
+            df (pd.DataFrame): The DataFrame with potential duplicate column names.
+
+        Returns:
+            pd.DataFrame: A DataFrame with unique column names.
+        """
+        seen = {}
+        new_columns = []
+        for col in df.columns:
+            new_col = col
+            count = seen.get(col, 0)
+            while new_col in new_columns:
+                count += 1
+                new_col = f"{col}_{count}"
+            new_columns.append(new_col)
+            seen[col] = count
+        df.columns = new_columns
+        return df
+
+    def get_markdown(self, df: pd.DataFrame) -> str:
+        """
+        Convert a DataFrame to a Markdown string.
+
+        Args:
+            df (pd.DataFrame): The DataFrame to convert.
+
+        Returns:
+            str: The JSON string.
+        """
+        buffer = StringIO()
+        df = self.unique_columns(df)
+        df.to_markdown(buffer)
+        buffer.seek(0)
+        return buffer.getvalue()
+
+    def parse_table(self, table_idx, table, page_number, path) -> pd.DataFrame:
+        df = table.to_pandas()  # convert to pandas DataFrame
+        df = df.dropna(axis=1, how='all')
+        df = df.dropna(how='all', axis=0)  # Drop empty rows
+        page = page_number + 1
+        table_meta = {
+            "url": '',
+            "source": f"{path.name} Page.#{page} Table.#{table_idx}",
+            "filename": path.name,
+            "index": f"{path.name}:Table:{table_idx}",
+            "question": '',
+            "answer": '',
+            "type": 'table',
+            "data": {},
+            "summary": '',
+            "document_meta": {
+                "table_index": table_idx,
+                "table_shape": df.shape,
+                "table_columns": df.columns.tolist(),
+                "description": f"Extracted from Page.#{page}."
+            },
+            "source_type": self._source_type
+        }
+        return df, table_meta
+
+    def _load_pdf(self, path: Path) -> list:
+        """
+        Load a PDF file using the Fitz library.
+
+        Args:
+            path (Path): The path to the PDF file.
+
+        Returns:
+            list: A list of Langchain Documents.
+        """
+        if self._check_path(path):
+            self.logger.info(f"Loading PDF file: {path}")
+            pdf = fitz.open(str(path))  # Open the PDF file
+            docs = []
+            for page_number in range(pdf.page_count):
+                page = pdf[page_number]
+                try:
+                    tabs = page.find_tables(**self.table_settings)
+                    for tab_idx, tab in enumerate(tabs):
+                        df, _meta = self.parse_table(tab_idx, tab, page_number, path)
+                        ## Sample information:
+                        print('::: Printing Table Information === ')
+                        print(df)
+                        print("::: Printing Column Information === ")
+                        for column, t in df.dtypes.items():
+                            print(column, "->", t, "->", df[column].iloc[0])
+                        # convert into markdown:
+                        txt = df.to_markdown()
+                        if txt:
+                            docs.append(
+                                Document(page_content=txt, metadata=_meta)
+                            )
+                except Exception as exc:
+                    print(exc)
+                    continue
+            return docs
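A minimal usage sketch for the tables loader (the input file and settings override are hypothetical; load() is again assumed to be the BasePDF entry point):

from pathlib import Path
from parrot.loaders.pdftables import PDFTablesLoader

loader = PDFTablesLoader(
    Path("invoice.pdf"),  # hypothetical input file
    table_settings={"vertical_strategy": "text"},  # merged over the defaults above
)
for doc in loader.load():  # one Document per detected table, rendered as Markdown
    print(doc.metadata["source"])

Note that _load_pdf renders each table with df.to_markdown() directly, so the unique_columns/get_markdown pair defined above is never exercised on this code path.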