ai-parrot 0.3.4__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic.
- ai_parrot-0.3.4.dist-info/LICENSE +21 -0
- ai_parrot-0.3.4.dist-info/METADATA +319 -0
- ai_parrot-0.3.4.dist-info/RECORD +109 -0
- ai_parrot-0.3.4.dist-info/WHEEL +6 -0
- ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
- parrot/__init__.py +21 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +728 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +366 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +83 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/odoo.py +17 -0
- parrot/chatbots/retrievals/__init__.py +578 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +110 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +162 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +137 -0
- parrot/llms/abstract.py +47 -0
- parrot/llms/anthropic.py +42 -0
- parrot/llms/google.py +42 -0
- parrot/llms/groq.py +45 -0
- parrot/llms/hf.py +45 -0
- parrot/llms/openai.py +59 -0
- parrot/llms/pipes.py +114 -0
- parrot/llms/vertex.py +78 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/audio.py +106 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +437 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +120 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +166 -0
- parrot/models.py +372 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +48 -0
- parrot/stores/abstract.py +171 -0
- parrot/stores/milvus.py +632 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +12 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/vimeo.py
ADDED
@@ -0,0 +1,106 @@
from typing import Optional, Union
from transformers import pipeline
import torch
from langchain.docstore.document import Document
from .youtube import YoutubeLoader


class VimeoLoader(YoutubeLoader):
    """
    Loader for Vimeo videos.
    """
    def load_video(self, url: str, video_title: str, transcript: Optional[Union[str, None]] = None) -> list:
        metadata = {
            "source": url,
            "url": url,
            "index": url,
            "filename": video_title,
            "question": '',
            "answer": '',
            'type': 'video_transcript',
            "source_type": self._source_type,
            "summary": '',
            "document_meta": {
                "language": self._language,
                "title": video_title,
                "topic_tags": ""
            }
        }
        if self.topics:
            metadata['document_meta']['topic_tags'] = self.topics
        if transcript is None:
            documents = []
            docs = []
            # first: download video
            try:
                file_path = self.download_video(url, self._video_path)
            except Exception:
                return []
            if not file_path:
                self.logger.warning(
                    f"Error downloading File for video: {self._video_path}"
                )
                return []
            transcript_path = file_path.with_suffix('.vtt')
            audio_path = file_path.with_suffix('.mp3')
            # second: extract audio
            self.extract_audio(file_path, audio_path)
            # get the Whisper parser
            transcript_whisper = self.get_whisper_transcript(audio_path)
            if transcript_whisper:
                transcript = transcript_whisper['text']
            else:
                transcript = ''
            # Summarize the transcript
            if transcript:
                summary = self.get_summary_from_text(transcript)
                # Create Two Documents, one is for transcript, second is VTT:
                metadata['summary'] = summary
                doc = Document(
                    page_content=transcript,
                    metadata=metadata
                )
                documents.append(doc)
            if transcript_whisper:
                # VTT version:
                transcript = self.transcript_to_vtt(transcript_whisper, transcript_path)
                doc = Document(
                    page_content=transcript,
                    metadata=metadata
                )
                documents.append(doc)
                # Saving every dialog chunk as a separate document
                dialogs = self.transcript_to_blocks(transcript_whisper)
                for chunk in dialogs:
                    _meta = {
                        "index": f"{video_title}:{chunk['id']}",
                        "document_meta": {
                            "start": f"{chunk['start_time']}",
                            "end": f"{chunk['end_time']}",
                            "id": f"{chunk['id']}",
                            "language": self._language,
                            "title": video_title,
                            "topic_tags": ""
                        }
                    }
                    _info = {**metadata, **_meta}
                    doc = Document(
                        page_content=chunk['text'],
                        metadata=_info
                    )
                    docs.append(doc)
                documents.extend(docs)
            return self.split_documents(documents)
        else:
            # using the transcript file
            with open(transcript, 'r') as f:
                transcript = f.read()
            summary = self.get_summary_from_text(transcript)
            transcript_whisper = None
            metadata['summary'] = f"{summary!s}"
            # Create Two Documents, one is for transcript, second is VTT:
            doc = Document(
                page_content=transcript,
                metadata=metadata
            )
            return self.split_documents([doc])
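
A minimal usage sketch for VimeoLoader (not part of the package diff). The load_video() signature comes from the code above; the constructor is inherited from YoutubeLoader, which is not shown in this diff, so the keyword arguments below (video_path, language, topics) are assumptions matching the attributes the method references (self._video_path, self._language, self.topics):

from parrot.loaders.vimeo import VimeoLoader

# Assumed constructor kwargs: VimeoLoader inherits __init__ from YoutubeLoader,
# whose signature is not part of this diff.
loader = VimeoLoader(
    video_path='/tmp/videos',   # assumed to back self._video_path
    language='en',              # assumed to back self._language
    topics=['product demo'],    # assumed to back self.topics
)
# Passing transcript=None takes the download -> audio extraction -> Whisper path;
# passing a path to an existing transcript file reads that file instead.
documents = loader.load_video(
    url='https://vimeo.com/123456789',
    video_title='Example Video',
)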
parrot/loaders/web.py
ADDED
@@ -0,0 +1,216 @@
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from langchain.docstore.document import Document
from langchain.text_splitter import MarkdownTextSplitter
from navconfig.logging import logging
from .abstract import AbstractLoader


logging.getLogger(name='selenium.webdriver').setLevel(logging.WARNING)
logging.getLogger(name='WDM').setLevel(logging.WARNING)
logging.getLogger(name='matplotlib').setLevel(logging.WARNING)


class WebLoader(AbstractLoader):
    """Class to load web pages and extract text."""
    chrome_options = [
        "--headless",
        "--enable-automation",
        "--lang=en",
        "--disable-extensions",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-features=NetworkService",
        "--disable-dev-shm-usage",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    ]
    def __init__(self, urls: dict, source_type: str = 'website', **kwargs):
        self.urls = urls
        self._source_type = source_type
        self._options = Options()
        self.timeout: int = kwargs.pop('timeout', 60)
        for option in self.chrome_options:
            self._options.add_argument(option)
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=self._options
        )
        self._mark_splitter = MarkdownTextSplitter(
            chunk_size = 1024,
            chunk_overlap=10
        )
        super().__init__(source_type=source_type, **kwargs)

    def md(self, soup, **options):
        return MarkdownConverter(**options).convert_soup(soup)

    def clean_html(self, html, tags, objects=[]):
        soup = BeautifulSoup(html, 'html.parser')
        page_title = soup.title.string
        md_text = self.md(soup)
        # Remove script and style elements
        for script_or_style in soup(["script", "style", "link"]):
            script_or_style.decompose()
        # Extract Content
        content = []
        paragraphs = [' '.join(p.get_text().split()) for p in soup.find_all(tags)]
        # Look for iframe elements and format their src attributes into readable strings
        iframes = soup.find_all('iframe')
        for iframe in iframes:
            video_src = iframe.get('src', '')
            # You might want to customize the formatting of this string
            formatted_video = f"Video Link: {video_src}" if video_src else ""
            content.append(formatted_video)
        if objects:
            for obj in objects:
                (element, args), = obj.items()
                if 'parse_list' in args:
                    parse_list = args.pop('parse_list')
                    # Find the element container:
                    container = soup.find(element, attrs=args)
                    # Parse list of objects (UL, LI)
                    name_type = parse_list.pop('type')
                    params = parse_list.get('find')
                    el = params.pop(0)
                    try:
                        attrs = params.pop(0)
                    except IndexError:
                        attrs = {}
                    elements = container.find_all(el, attrs=attrs)
                    structured_text = ''
                    for element in elements:
                        title = element.find('span', class_='title').get_text(strip=True)
                        lists = element.find_all('ul')
                        if lists:
                            structured_text += f"\nCategory: {title}\n{name_type}:\n"
                            for ul in lists:
                                items = [f"- {li.get_text(strip=True)}" for li in ul.select('li')]
                                formatted_list = '\n'.join(items)
                                structured_text += formatted_list
                                structured_text += "\n"
                    content.append(structured_text)
                else:
                    elements = soup.find_all(element, attrs=args)
                    for element in elements:
                        # Handle <a> tags within the current element
                        links = element.find_all('a')
                        for link in links:
                            # Extract link text and href, format them into a readable string
                            link_text = link.get_text(strip=True)
                            href = link.get('href', '')
                            formatted_link = (
                                f"{link_text} (Link: {href})"
                                if href
                                else link_text
                            )
                            # Replace the original link text in the element
                            # with the formatted version
                            link.replace_with(formatted_link)
                        # work with UL lists:
                        lists = element.find_all('ul')
                        for ul in lists:
                            items = [li.get_text(strip=True) for li in ul.select('li')]
                            formatted_list = '\n'.join(items)
                            content.append(formatted_list)
                        cleaned_text = ' '.join(element.get_text().split())
                        content.append(cleaned_text)
        return (content + paragraphs, md_text, page_title)

    def get(self, address: dict) -> list:
        (url, args), = address.items()
        self.logger.info(
            f'Downloading URL {url} with args {args}'
        )
        locator = args.get('locator', (By.TAG_NAME, 'body'))
        wait = WebDriverWait(self.driver, self.timeout)
        acookies = args.get('accept_cookies', False)
        try:
            self.driver.get(url)
            # After loading page, accept cookies
            wait.until(
                EC.presence_of_element_located(
                    locator
                )
            )
            if acookies:
                btn = wait.until(
                    EC.element_to_be_clickable(
                        acookies
                    )
                )
                btn.click()
        except Exception as exc:
            print(f"Failed to Get {url}: {exc}")
            self.logger.exception(
                str(exc), stack_info=True
            )
            raise
        try:
            extract = args.get('tags', ['p', 'title', 'h1', 'h2', 'section', 'article'])
            objects = args.get('objects', [])
            source_type = args.get('source_type', self._source_type)
            html_content = self.driver.page_source
            content, md_text, page_title = self.clean_html(
                html_content,
                extract,
                objects
            )
            metadata = {
                "source": url,
                "index": page_title,
                "url": url,
                "filename": page_title,
                "question": '',
                "answer": '',
                "source_type": source_type,
                'type': 'webpage',
                'summary': '',
                "document_meta": {
                    "language": "en",
                    "title": page_title,
                },
            }
            docs = []
            if md_text:
                docs.append(
                    Document(
                        page_content=md_text,
                        metadata=metadata
                    )
                )
                # for chunk in self._mark_splitter.split_text(md_text):
                #     docs.append(
                #         Document(
                #             page_content=chunk,
                #             metadata=metadata
                #         )
                #     )
            if content:
                site_content = [
                    Document(
                        page_content=paragraph,
                        metadata=metadata
                    ) for paragraph in content
                ]
                return docs + site_content
        except Exception as exc:
            print(f"Failed to load {url}: {exc}")

    def load(self, **kwargs) -> list:
        documents = []
        for address in self.urls:
            docs = self.get(address)
            if docs:
                documents.extend(docs)
        self.driver.quit()
        return self.split_by_tokens(documents)

    def parse(self, source):
        pass
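
A usage sketch for WebLoader (not part of the package diff), based on get() above, which unpacks each address as a one-item dict mapping the URL to its per-page arguments; the URL and argument values are illustrative only:

from selenium.webdriver.common.by import By
from parrot.loaders.web import WebLoader

# Each address is a one-item dict {url: args}, matching
# `(url, args), = address.items()` in get().
urls = [
    {
        'https://example.com/': {
            'locator': (By.TAG_NAME, 'body'),      # element to wait for before scraping
            'tags': ['p', 'h1', 'h2', 'article'],  # tags handed to clean_html()
            'accept_cookies': False,               # or a (By, selector) tuple for the consent button
        }
    }
]
loader = WebLoader(urls=urls, timeout=30)
documents = loader.load()   # drives headless Chrome, then splits the Documents by tokens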
parrot/loaders/web_base.py
ADDED
@@ -0,0 +1,112 @@
import asyncio
import aiohttp
from markdownify import MarkdownConverter
from bs4 import BeautifulSoup as bs
from langchain.text_splitter import MarkdownTextSplitter
from langchain.docstore.document import Document
from .abstract import AbstractLoader


class WebBaseLoader(AbstractLoader):
    """Class to load web pages and extract text as Markdown."""
    def __init__(self, urls: dict, source_type: str = 'website', **kwargs):
        self.urls = urls
        self._source_type = source_type
        self.timeout: int = kwargs.pop('timeout', 60)
        self._wait: int = kwargs.pop('wait', 60)
        super().__init__(source_type=source_type, **kwargs)
        self.md_splitter = MarkdownTextSplitter(
            chunk_size = 1024,
            chunk_overlap=10
        )

    def md(self, soup, **options):
        return MarkdownConverter(**options).convert_soup(soup)

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        async with aiohttp.ClientSession() as session:
            for i in range(retries):
                try:
                    async with session.get(url, allow_redirects=True) as response:
                        if response.status == 200:
                            return await response.text()
                        elif response.status == 429:
                            # Too many requests
                            if i == retries - 1:
                                return ''
                            self.logger.warning(
                                f"Too many requests to {url}. Waiting {self._wait} seconds."
                            )
                            await asyncio.sleep(self._wait)
                        else:  # Other non-success status codes
                            response.raise_for_status()  # Raise for other errors
                except aiohttp.ClientConnectionError as e:
                    if i == retries - 1:
                        raise
                    else:
                        self.logger.warning(
                            f"Error fetching {url} with attempt "
                            f"{i + 1}/{retries}: {e}. Retrying..."
                        )
                        await asyncio.sleep(cooldown * backoff**i)
                except aiohttp.ClientResponseError as e:
                    self.logger.warning(
                        f"Request failed (ClientResponseError): {e}"
                    )
                    return ''
        self.logger.warning(f"Failed to fetch {url} after {retries} attempts.")
        return ''

    async def get_address(self, url: str) -> list:
        self.logger.info(
            f'Downloading URL {url}'
        )
        html = await self._fetch(url)
        docs = []
        if html:
            soup = bs(html, 'html.parser')
            md_text = self.md(soup)
            try:
                title = soup.title.string
            except AttributeError:
                title = None
            metadata = {
                "url": url,
                "source": url,
                "index": "",
                "filename": '',
                "type": 'webpage',
                "question": '',
                "answer": '',
                "source_type": self._source_type,
                "summary": "",
                "document_meta": {
                    "title": title,
                    "description": soup.find('meta', attrs={'name': 'description'})['content']
                    if soup.find('meta', attrs={'name': 'description'}) else '',
                    "keywords": soup.find('meta', attrs={'name': 'keywords'})['content']
                    if soup.find('meta', attrs={'name': 'keywords'}) else '',
                }
            }
            for chunk in self.md_splitter.split_text(md_text):
                docs.append(
                    Document(
                        page_content=chunk,
                        metadata=metadata
                    )
                )
        return docs

    async def load(self, **kwargs) -> list:
        documents = []
        if self.urls is None:
            return []
        for address in self.urls:
            docs = await self.get_address(address)
            documents.extend(docs)
        return self.split_documents(documents)

    def parse(self, source):
        pass
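
A usage sketch for WebBaseLoader (not part of the package diff). Its load() is a coroutine, and get_address() above accepts plain URL strings, so a simple iterable of URLs is assumed here even though the parameter is annotated as dict; the URLs are illustrative only:

import asyncio
from parrot.loaders.web_base import WebBaseLoader

loader = WebBaseLoader(
    urls=['https://example.com/', 'https://example.org/docs'],
    timeout=30,   # popped from kwargs in __init__
    wait=60,      # back-off applied on HTTP 429 responses in _fetch()
)
# load() fetches each URL with aiohttp, converts the HTML to Markdown,
# chunks it with MarkdownTextSplitter and returns the split Documents.
documents = asyncio.run(loader.load())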
parrot/loaders/word.py
ADDED
@@ -0,0 +1,125 @@
from collections.abc import Callable
from pathlib import Path, PurePath
from typing import Union
import mammoth
import docx
from markdownify import markdownify as md
from langchain_community.document_loaders.word_document import (
    UnstructuredWordDocumentLoader
)
from langchain.text_splitter import MarkdownTextSplitter
from langchain.docstore.document import Document
from .abstract import AbstractLoader


class MSWordLoader(AbstractLoader):
    """
    Loader for Microsoft Word files.
    """
    _extension: list = ['.docx', '.doc']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Union[str, Callable] = None,
        text_splitter: Union[str, Callable] = None,
        source_type: str = 'document',
        **kwargs
    ):
        self.path = path
        if isinstance(path, str):
            path = Path(path)
        self._use_mammoth: bool = kwargs.pop('use_mammoth', True)
        self._md_splitter = MarkdownTextSplitter(chunk_size = 1024, chunk_overlap=10)
        super().__init__(
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            **kwargs
        )

    def _load_document(self, path: PurePath) -> list:
        if self._check_path(path):
            docs = []
            self.logger.info(f"Loading Word file: {path}")
            if self._use_mammoth:
                with open(path, "rb") as docx_file:
                    doc = docx.Document(str(path))
                    prop = doc.core_properties
                    result = mammoth.convert_to_html(docx_file)
                    # result = mammoth.extract_raw_text(docx_file)
                    html = result.value  # The generated HTML
                    md_text = md(html)  # The generated Markdown
                try:
                    summary = self.get_summary_from_text(md_text)
                except ValueError:
                    summary = ''
                metadata = {
                    "url": '',
                    "source": path.name,
                    "filename": path.name,
                    "index": str(path.name),
                    "type": 'document',
                    "question": '',
                    "answer": '',
                    "source_type": self._source_type,
                    "data": {},
                    "summary": summary,
                    "document_meta": {
                        "author": prop.author,
                        "version": prop.version,
                        "title": prop.title,
                        "created": prop.created.strftime("%Y-%m-%d %H:%M:%S"),
                        "last_modified": prop.modified.strftime("%Y-%m-%d %H:%M:%S")
                    }
                }
                for chunk in self._md_splitter.split_text(md_text):
                    _idx = {
                        **metadata
                    }
                    docs.append(
                        Document(
                            page_content=chunk,
                            metadata=_idx
                        )
                    )
                return docs
            else:
                word_loader = UnstructuredWordDocumentLoader(
                    file_path=str(path)
                )
                docs = word_loader.load()
                for doc in docs:
                    # Fix This
                    doc.metadata['source_type'] = self._source_type
                return []

    def load(self) -> list:
        """
        Load data from a DOCX file.

        Args:
            source (str): The path to the DOCX file.

        Returns:
            list: A list of Langchain Documents.
        """
        if not self.path.exists():
            raise FileNotFoundError(f"DOCX file/directory not found: {self.path}")
        if self.path.is_dir():
            documents = []
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    documents.extend(self._load_document(item))
        elif self.path.is_file():
            documents = self._load_document(self.path)
        else:
            raise ValueError(
                f"DOCX Loader: Invalid path: {self.path}"
            )
        # return documents
        return self.split_documents(documents)

    def parse(self, source):
        pass
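
A usage sketch for MSWordLoader (not part of the package diff). The constructor signature is taken from the code above; the file path is illustrative, and a pathlib.Path is passed directly because __init__ stores the original path argument before converting strings to Path:

from pathlib import Path
from parrot.loaders.word import MSWordLoader

loader = MSWordLoader(
    path=Path('reports/quarterly_review.docx'),  # a single file or a directory of .docx/.doc files
    use_mammoth=True,   # DOCX -> HTML (mammoth) -> Markdown before splitting
)
documents = loader.load()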