ai-parrot 0.1.0__cp311-cp311-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic.
- ai_parrot-0.1.0.dist-info/LICENSE +21 -0
- ai_parrot-0.1.0.dist-info/METADATA +299 -0
- ai_parrot-0.1.0.dist-info/RECORD +108 -0
- ai_parrot-0.1.0.dist-info/WHEEL +5 -0
- ai_parrot-0.1.0.dist-info/top_level.txt +3 -0
- parrot/__init__.py +18 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +965 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +257 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +100 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/oddie.py +17 -0
- parrot/chatbots/retrievals/__init__.py +515 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +108 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +169 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +0 -0
- parrot/llms/abstract.py +41 -0
- parrot/llms/anthropic.py +36 -0
- parrot/llms/google.py +37 -0
- parrot/llms/groq.py +33 -0
- parrot/llms/hf.py +39 -0
- parrot/llms/openai.py +49 -0
- parrot/llms/pipes.py +103 -0
- parrot/llms/vertex.py +68 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +187 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +107 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +152 -0
- parrot/models.py +347 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +0 -0
- parrot/stores/abstract.py +170 -0
- parrot/stores/milvus.py +540 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +16 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/pdfimages.py
ADDED

@@ -0,0 +1,207 @@
from collections.abc import Callable
from pathlib import Path, PurePath
from typing import Any
import fitz
from pdf4llm import to_markdown
from PIL import Image
from langchain.docstore.document import Document
from langchain.text_splitter import MarkdownTextSplitter
from transformers import (
    AutoTokenizer,
    AutoProcessor,
    LlavaForConditionalGeneration,
    pipeline,
    BitsAndBytesConfig
)
import torch
from .basepdf import BasePDF


quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)


class PDFImageLoader(BasePDF):
    """
    Loader for PDF files.
    """
    default_prompt: str = "<|user|>\n<image>\nExplain this schematic diagram or technical installation instructions and wire diagrams, please be detailed about descriptions of steps:<|end|>\n<|assistant|>\n"
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        self._image_model = kwargs.get('image_model', 'llava-hf/llava-v1.6-vicuna-7b-hf')
        self._task = kwargs.get('task', 'image-to-text')
        self._max_tokens = kwargs.get('max_tokens', 600)
        # Loading the model with low CPU memory usage
        # model = LlavaForConditionalGeneration.from_pretrained(
        #     self._image_model,
        #     quantization_config=quantization_config,
        #     device_map="auto",
        #     torch_dtype=torch.float16,
        #     low_cpu_mem_usage=True
        # )
        # # Load the processor
        # processor = AutoProcessor.from_pretrained(self._image_model, use_fast=True)
        self._pipeline = pipeline(
            self._task,
            model=self._image_model,
            # tokenizer=processor.tokenizer,
            # image_processor=processor.image_processor,
            model_kwargs={"quantization_config": quantization_config},
            # device=self._device,
            max_new_tokens=self._max_tokens,
            # low_cpu_mem_usage=True,
            use_fast=True
        )
        # default prompt
        self._prompt = kwargs.get('prompt', self.default_prompt)
        # Markdown Splitter
        self._splitter = MarkdownTextSplitter(
            chunk_size = self._chunk_size,
            chunk_overlap=10
        )

    def pixmap_to_pil_image(self, pix):
        """Converts a PyMuPDF Pixmap object to a PIL Image"""
        return Image.frombytes(
            "RGB",
            [pix.width, pix.height],
            pix.samples
        )

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file as Images.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading PDF file: {path}")
            pdf = fitz.open(str(path))  # Open the PDF file
            docs = []
            try:
                # get markdown for all pages and saved separately
                md_text = to_markdown(pdf)
                try:
                    summary = self.get_summary_from_text(md_text)
                except Exception:
                    summary = ''
                metadata = {
                    "url": '',
                    "idx": str(path.name),
                    "filename": str(path.name),
                    "source": str(path.name),
                    "type": 'pdf',
                    "question": '',
                    "answer": '',
                    "data": {},
                    "summary": summary,
                    "source_type": self._source_type,
                    "document_meta": {
                        "title": pdf.metadata.get("title", ""),
                        "author": pdf.metadata.get("author", ""),
                    }
                }
                for idx, chunk in enumerate(self._splitter.split_text(md_text)):
                    _info = {
                        "index": f"{idx}",
                        **metadata
                    }
                    docs.append(
                        Document(
                            page_content=chunk,
                            metadata=_info
                        )
                    )
            except (IndexError, ValueError) as exc:
                self.logger.warning(
                    f"There is no text data to load on {path.name}: {exc}"
                )
            # Then, processing the pages one by one as Images:
            file_name = path.stem.replace(" ", "_").replace(".", "_")
            for page_number in range(pdf.page_count):
                page_num = page_number + 1
                self.logger.notice(
                    f"Processing PDF {path} on Page {page_num}"
                )
                page = pdf[page_number]
                pix = page.get_pixmap(colorspace=fitz.csRGB, alpha=False)
                zoom_x = 2.0  # horizontal zoom
                zoom_y = 2.0  # vertical zoom
                mat = fitz.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension
                img_stream = self.pixmap_to_pil_image(pix)
                url = ''
                img_name = f'image_{file_name}_{page_num}.png'
                if self.save_images is True:
                    img_path = self.save_image(
                        img_stream,
                        img_name,
                        self._imgdir
                    )
                    url = f'/static/images/{img_name}'
                # extracting features and explanations:
                outputs = self._pipeline(
                    img_stream,
                    prompt=self._prompt,
                    generate_kwargs={"max_new_tokens": self._max_tokens}
                )
                documents = []
                for idx, output in enumerate(outputs):
                    generated_text = output['generated_text']
                    # Split using the special tokens, if available
                    split_text = generated_text.split("<|assistant|>")
                    prompt_text = split_text[0].replace("<|prompt|>", "").strip() if "<|prompt|>" in generated_text else ""
                    response_text = split_text[1].strip() if len(split_text) > 1 else ""
                    # Attach the image using Markdown syntax
                    image_markdown = f"\n\n\n"
                    response_text += image_markdown
                    _meta = {
                        "url": f"{url}",
                        "filename": str(path.name),
                        "index": f"Page {page_num}, part: {idx}",
                        "source": str(path.name),
                        "type": 'pdf',
                        "question": prompt_text,
                        "answer": '',
                        "data": {},
                        "summary": '',
                        "source_type": self._source_type,
                        "document_meta": {
                            "page": f"Page {page}",
                            "image": f"{img_name}",
                            "url": f"{url}"
                        }
                    }
                    documents.append(
                        Document(
                            page_content=response_text,
                            metadata=_meta
                        )
                    )
            return docs + documents

    def load(self) -> list:
        try:
            return super().load()
        finally:
            self._pipeline = None
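As a rough usage sketch (the file path and keyword values are hypothetical, and it assumes BasePDF accepts these keyword arguments and backs the load() entry point used above):

from parrot.loaders.pdfimages import PDFImageLoader

loader = PDFImageLoader(
    path="manuals/installation_guide.pdf",            # hypothetical input file
    image_model="llava-hf/llava-v1.6-vicuna-7b-hf",   # same as the default above
    max_tokens=600,
)
documents = loader.load()   # markdown chunks plus one Document per page image
for doc in documents:
    print(doc.metadata.get("index"), len(doc.page_content))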
parrot/loaders/pdfmark.py
ADDED

@@ -0,0 +1,88 @@
from typing import Any
from collections.abc import Callable
from pathlib import Path, PurePath
import fitz
from pdf4llm import to_markdown
from langchain.docstore.document import Document
from langchain.text_splitter import MarkdownTextSplitter
from .basepdf import BasePDF

class PDFMarkdownLoader(BasePDF):
    """
    Loader for PDF files converted content to markdown.
    """

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        self._splitter = MarkdownTextSplitter(chunk_size = 1024, chunk_overlap=10)

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using the PDFMiner library.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading PDF file: {path}")
            docs = []
            pdf = fitz.open(str(path))
            md_text = to_markdown(pdf)  # get markdown for all pages
            try:
                summary = self.get_summary_from_text(md_text)
            except Exception:
                summary = ''
            metadata = {
                "url": '',
                "filename": path.name,
                # "index": f"{path.name}",
                "source": str(path.name),
                "type": 'pdf',
                "question": '',
                "answer": '',
                "data": {},
                "summary": summary,
                "source_type": self._source_type,
                "document_meta": {
                    "title": pdf.metadata.get("title", ""),
                    # "subject": pdf.metadata.get("subject", ""),
                    # "keywords": pdf.metadata.get("keywords", ""),
                    "creationDate": pdf.metadata.get("creationDate", ""),
                    # "modDate": pdf.metadata.get("modDate", ""),
                    # "producer": pdf.metadata.get("producer", ""),
                    # "creator": pdf.metadata.get("creator", ""),
                    "author": pdf.metadata.get("author", ""),
                }
            }
            for idx, chunk in enumerate(self._splitter.split_text(md_text)):
                _info = {
                    "index": f"{idx}",
                    **metadata
                }
                docs.append(
                    Document(
                        page_content=chunk,
                        metadata=_info
                    )
                )
            return docs
        else:
            return []
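A minimal driving sketch for this loader (hypothetical file name; it assumes BasePDF.load() walks the configured path and dispatches to _load_pdf):

from parrot.loaders.pdfmark import PDFMarkdownLoader

loader = PDFMarkdownLoader(path="reports/annual_report.pdf")  # hypothetical PDF
docs = loader.load()
# Each Document carries a ~1024-character markdown chunk; the shared metadata
# (summary, title, author, creationDate) is copied onto every chunk.
print(docs[0].metadata["document_meta"])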
parrot/loaders/pdftables.py
ADDED

@@ -0,0 +1,145 @@
from collections.abc import Callable
from typing import Any, Optional, List
from pathlib import Path, PurePath
from io import StringIO
import fitz
import pandas as pd
from langchain.docstore.document import Document
from .basepdf import BasePDF


class PDFTablesLoader(BasePDF):
    """
    Loader for Tables in PDF Files.
    """
    _extension = ['.pdf']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        table_settings: dict = {},
        **kwargs
    ):
        super().__init__(
            path,
            tokenizer,
            text_splitter,
            source_type,
            language=language,
            **kwargs
        )
        # Table Settings:
        self.table_settings = {
            # "vertical_strategy": "text",
            # "horizontal_strategy": "text",
            "intersection_x_tolerance": 5,
            "intersection_y_tolerance": 5
        }
        if table_settings:
            self.table_settings.update(table_settings)
        self._skiprows = kwargs.pop('skiprows', None)

    def unique_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rename duplicate columns in the DataFrame to ensure they are unique.

        Args:
            df (pd.DataFrame): The DataFrame with potential duplicate column names.

        Returns:
            pd.DataFrame: A DataFrame with unique column names.
        """
        seen = {}
        new_columns = []
        for col in df.columns:
            new_col = col
            count = seen.get(col, 0)
            while new_col in new_columns:
                count += 1
                new_col = f"{col}_{count}"
            new_columns.append(new_col)
            seen[col] = count
        df.columns = new_columns
        return df

    def get_markdown(self, df: pd.DataFrame) -> str:
        """
        Convert a DataFrame to a Markdown string.

        Args:
            df (pd.DataFrame): The DataFrame to convert.

        Returns:
            str: The JSON string.
        """
        buffer = StringIO()
        df = self.unique_columns(df)
        df.to_markdown(buffer)
        buffer.seek(0)
        return buffer.getvalue()

    def parse_table(self, table_idx, table, page_number, path) -> pd.DataFrame:
        df = table.to_pandas()  # convert to pandas DataFrame
        df = df.dropna(axis=1, how='all')
        df = df.dropna(how='all', axis=0)  # Drop empty rows
        page = page_number + 1
        table_meta = {
            "url": '',
            "source": f"{path.name} Page.#{page} Table.#{table_idx}",
            "filename": path.name,
            "index": f"{path.name}:Table:{table_idx}",
            "question": '',
            "answer": '',
            "type": 'table',
            "data": {},
            "summary": '',
            "document_meta": {
                "table_index": table_idx,
                "table_shape": df.shape,
                "table_columns": df.columns.tolist(),
                "description": f"Extracted from Page.#{page}."
            },
            "source_type": self._source_type
        }
        return df, table_meta

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using the Fitz library.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading PDF file: {path}")
            pdf = fitz.open(str(path))  # Open the PDF file
            docs = []
            for page_number in range(pdf.page_count):
                page = pdf[page_number]
                try:
                    tabs = page.find_tables(**self.table_settings)
                    for tab_idx, tab in enumerate(tabs):
                        df, _meta = self.parse_table(tab_idx, tab, page_number, path)
                        ## Sample information:
                        print('::: Printing Table Information === ')
                        print(df)
                        print("::: Printing Column Information === ")
                        for column, t in df.dtypes.items():
                            print(column, "->", t, "->", df[column].iloc[0])
                        # convert into markdown:
                        txt = df.to_markdown()
                        if txt:
                            docs.append(
                                Document(page_content=txt, metadata=_meta)
                            )
                except Exception as exc:
                    print(exc)
                    continue
            return docs
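Since table_settings is passed straight into PyMuPDF's page.find_tables(), callers can override the detection strategy; a sketch with hypothetical values, again assuming BasePDF.load() dispatches to _load_pdf:

from parrot.loaders.pdftables import PDFTablesLoader

loader = PDFTablesLoader(
    path="specs/datasheet.pdf",                    # hypothetical PDF
    table_settings={"vertical_strategy": "text"},  # merged over the defaults above
)
table_docs = loader.load()
# One Document per detected table; page_content is the table rendered as markdown.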
parrot/loaders/ppt.py
ADDED
@@ -0,0 +1,30 @@
from pathlib import PurePath
from langchain_community.document_loaders import (
    UnstructuredPowerPointLoader
)
from .abstract import AbstractLoader


class PPTXLoader(AbstractLoader):
    """
    Loader for PPTX files.
    """
    _extension: list = ['.pptx']

    def load(self, path: PurePath) -> list:
        if self._check_path(path):
            docs = []
            self.logger.info(f"Loading PPTX file: {path}")
            ppt_loader = UnstructuredPowerPointLoader(
                file_path=str(path)
            )
            docs += ppt_loader.load()
            for doc in docs:
                doc.metadata['source_type'] = self._source_type
            # Split the documents into chunks
            return self.split_documents(docs)
        else:
            return []

    def parse(self, source):
        pass
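Unlike the PDF loaders, load() here receives the path directly; a short sketch (the constructor arguments and file name are assumptions, since AbstractLoader's signature is not shown in this diff):

from pathlib import Path
from parrot.loaders.ppt import PPTXLoader

loader = PPTXLoader(source_type='pptx')   # assumed AbstractLoader kwargs
chunks = loader.load(Path("decks/quarterly_review.pptx"))   # hypothetical deck
# Returns the UnstructuredPowerPointLoader output split into chunks, each tagged with source_type.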
parrot/loaders/qa.py
ADDED
@@ -0,0 +1,81 @@

from pathlib import Path, PurePath
from typing import Any
from collections.abc import Callable
import pandas as pd
from langchain.docstore.document import Document
from .abstract import AbstractLoader


class QAFileLoader(AbstractLoader):
    """
    Question and Answers File based on Excel.
    """
    _extension = ['.xlsx']
    chunk_size = 768

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'QA',
        columns: list = ['Question', 'Answer'],
        **kwargs
    ):
        super().__init__(tokenizer, text_splitter, source_type, **kwargs)
        self.path = path
        self._columns = columns
        if isinstance(path, str):
            self.path = Path(path).resolve()
        if self.path.is_dir():
            raise ValueError(
                f"Currently only accepting single Files."
            )

    def _load_document(self, path: PurePath) -> list:
        if path.exists():
            print('Load QA Excel File: ', path)
            df = pd.read_excel(path)
            q = self._columns[0]
            a = self._columns[1]
            docs = []
            for idx, row in df.iterrows():
                # Question Document
                doc = Document(
                    page_content=f"**Question:** {row[q]}: **Answer:** {row[a]}",
                    metadata={
                        "url": '',
                        "index": f"{path.name} #{idx}",
                        "source": f"{path.name} Row.#{idx}",
                        "filename": f"{path.name}",
                        "question": row[q],
                        "answer": row[a],
                        "page_number": idx,
                        "source_type": self._source_type,
                        "type": "QA",
                        "summary": f"Question: {row[q]}?: **{row[a]}**",
                        "document_meta": {
                            "question": row[q],
                            "answer": row[a],
                        }
                    }
                )
                docs.append(doc)
            return docs
        return []

    def load(self, **kwargs) -> list:
        """
        Load Chapters from a PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self.path.is_file():
            documents = self._load_document(path=self.path, **kwargs)
            # after all documents are retrieved, procesed and stored
            return self.split_documents(documents)

    def parse(self, source):
        pass
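A usage sketch for the Q&A spreadsheet loader (hypothetical file; the column names must match the sheet's header row):

from parrot.loaders.qa import QAFileLoader

loader = QAFileLoader(
    path="data/faq.xlsx",               # hypothetical spreadsheet
    columns=['Question', 'Answer'],     # defaults shown above
)
qa_docs = loader.load()
# Each row becomes a Document whose page_content reads
# "**Question:** ...: **Answer:** ..." with the raw Q/A kept in metadata.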
parrot/loaders/repo.py
ADDED
@@ -0,0 +1,103 @@
from pathlib import PurePath
from langchain_core.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_community.document_loaders import (
    DirectoryLoader,
    TextLoader,
    JSONLoader
)
from langchain_text_splitters import Language
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter
)
from .abstract import AbstractLoader


class RepositoryLoader(AbstractLoader):
    """Repository (Code Directory) loader.
    """
    exclude_paths: list = [
        ".venv/**",
        ".venv/**/**/*",
        ".git/**",
        "node_modules/**",
        "build/**",
        "dist/**",
        "templates/**",
        "tmp/**"
    ]

    def load(self, path: PurePath, lang: str = 'python', excludes: list = []) -> list:
        """
        Load data from a repository and return it as a Langchain Document.
        """
        if isinstance(path, str):
            path = PurePath(path)
        if excludes:
            self.exclude_paths += excludes
        excludes_path = [
            str(path.joinpath(p).resolve()) for p in self.exclude_paths
        ]
        if lang == 'python':
            parser = LanguageParser(language=Language.PYTHON, parser_threshold=100)
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=Language.PYTHON, chunk_size=1024, chunk_overlap=200
            )
            suffixes = [".py", ".pyx"]
            glob = "**/[!.]*.py?"
        elif lang == 'javascript':
            parser = LanguageParser(language=Language.JS, parser_threshold=100)
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=Language.JS, chunk_size=1024, chunk_overlap=200
            )
            suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
        elif lang == 'typescript':
            parser = LanguageParser(language=Language.TS, parser_threshold=100)
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=Language.TS, chunk_size=1024, chunk_overlap=200
            )
            suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
        elif lang == 'json':
            loader = DirectoryLoader(
                path,
                glob="**/*.json",
                show_progress=True,
                exclude=excludes_path,
                silent_errors=True,
                recursive=True,
                # loader_cls=TextLoader,
                loader_cls=JSONLoader,
                loader_kwargs={
                    'jq_schema': '.',
                    'text_content': False
                }
            )
            docs = loader.load()
            for doc in docs:
                doc.metadata['url'] = ''
                doc.metadata['source_type'] = self._source_type
                doc.metadata['language'] = lang
            return self.text_splitter.split_documents(docs)
        else:
            raise ValueError(
                f"Language {lang} not supported for Repository"
            )
        loader = GenericLoader.from_filesystem(
            path,
            glob=glob,
            suffixes=suffixes,
            exclude=self.exclude_paths,
            parser=parser,
            show_progress=True
        )
        docs = loader.load()
        for doc in docs:
            doc.metadata['url'] = ''
            doc.metadata['source_type'] = self._source_type
            doc.metadata['language'] = lang
        documents = splitter.split_documents(docs)
        return documents

    def parse(self, source):
        raise NotImplementedError("Parser method is not implemented for PDFLoader.")
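A usage sketch for the repository loader (paths are hypothetical; the constructor arguments are assumed from AbstractLoader, which this diff does not show):

from parrot.loaders.repo import RepositoryLoader

loader = RepositoryLoader(source_type='code')          # assumed AbstractLoader kwargs
py_docs = loader.load("./my_service", lang='python')   # GenericLoader + LanguageParser branch
json_docs = loader.load("./my_service", lang='json', excludes=["tests/**"])  # DirectoryLoader branch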