ai-parrot 0.1.0__cp311-cp311-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic.
- ai_parrot-0.1.0.dist-info/LICENSE +21 -0
- ai_parrot-0.1.0.dist-info/METADATA +299 -0
- ai_parrot-0.1.0.dist-info/RECORD +108 -0
- ai_parrot-0.1.0.dist-info/WHEEL +5 -0
- ai_parrot-0.1.0.dist-info/top_level.txt +3 -0
- parrot/__init__.py +18 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +965 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +257 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +100 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/oddie.py +17 -0
- parrot/chatbots/retrievals/__init__.py +515 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +108 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +169 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +0 -0
- parrot/llms/abstract.py +41 -0
- parrot/llms/anthropic.py +36 -0
- parrot/llms/google.py +37 -0
- parrot/llms/groq.py +33 -0
- parrot/llms/hf.py +39 -0
- parrot/llms/openai.py +49 -0
- parrot/llms/pipes.py +103 -0
- parrot/llms/vertex.py +68 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +187 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +107 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +152 -0
- parrot/models.py +347 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +0 -0
- parrot/stores/abstract.py +170 -0
- parrot/stores/milvus.py +540 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +16 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/basevideo.py
ADDED
@@ -0,0 +1,280 @@
from collections.abc import Callable
from typing import Any, Union, List
from abc import abstractmethod
from pathlib import Path
from moviepy.editor import VideoFileClip
from transformers import (
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    GenerationConfig
)
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTimeStampLogitsProcessor
)

from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import (
    TokenTextSplitter,
)
from .abstract import AbstractLoader


def extract_video_id(url):
    # Extract the "v" query parameter from a YouTube-style URL.
    parts = url.split("?v=")
    video_id = parts[1].split("&")[0]
    return video_id


class BaseVideoLoader(AbstractLoader):
    """
    Generate video transcripts from videos.
    """
    _extension = ['.youtube']
    encoding = 'utf-8'
    chunk_size = 768

    def __init__(
        self,
        urls: List[str],
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'video',
        language: str = "en",
        video_path: Union[str, Path] = None,
        **kwargs
    ):
        super().__init__(tokenizer, text_splitter, source_type, **kwargs)
        self.urls = urls
        self._task = kwargs.get('task', "automatic-speech-recognition")
        # Topics:
        self.topics: list = kwargs.get('topics', [])
        self._model_size: str = kwargs.get('model_size', 'medium')
        self.summarization_model = "facebook/bart-large-cnn"
        self._model_name: str = kwargs.get('model_name', 'whisper')
        self.summarizer = pipeline(
            "summarization",
            tokenizer=AutoTokenizer.from_pretrained(
                self.summarization_model
            ),
            model=AutoModelForSeq2SeqLM.from_pretrained(
                self.summarization_model
            ),
            device=self._get_device()
        )
        # language:
        self._language = language
        # directory: normalize string paths before storing
        if isinstance(video_path, str):
            video_path = Path(video_path).resolve()
        self._video_path = video_path

    def transcript_to_vtt(self, transcript: dict, transcript_path: Path) -> str:
        """
        Convert a transcript to VTT format.
        """
        vtt = "WEBVTT\n\n"
        for i, chunk in enumerate(transcript['chunks'], start=1):
            start, end = chunk['timestamp']
            text = chunk['text'].replace("\n", " ")  # Replace newlines in text with spaces

            if start is None or end is None:
                print(f"Warning: Missing timestamp for chunk {i}, skipping this chunk.")
                continue

            # Convert timestamps to WebVTT format (HH:MM:SS.MMM)
            start_vtt = f"{int(start // 3600):02}:{int(start % 3600 // 60):02}:{int(start % 60):02}.{int(start * 1000 % 1000):03}"
            end_vtt = f"{int(end // 3600):02}:{int(end % 3600 // 60):02}:{int(end % 60):02}.{int(end * 1000 % 1000):03}"

            vtt += f"{i}\n{start_vtt} --> {end_vtt}\n{text}\n\n"
        # Save the VTT file
        try:
            with open(str(transcript_path), "w") as f:
                f.write(vtt)
            print(f'Saved VTT File on {transcript_path}')
        except Exception as exc:
            print(f"Error saving VTT file: {exc}")
        return vtt

    def format_timestamp(self, seconds):
        # Format total seconds as hh:mm:ss,ms (milliseconds taken before truncation)
        milliseconds = int((seconds % 1) * 1000)
        hours, remainder = divmod(int(seconds), 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

    def transcript_to_blocks(self, transcript: dict) -> list:
        """
        Convert a transcript to blocks.
        """
        blocks = []
        for i, chunk in enumerate(transcript['chunks'], start=1):
            current_window = {}
            start, end = chunk['timestamp']
            if start is None or end is None:
                print(f"Warning: Missing timestamp for chunk {i}, skipping this chunk.")
                continue

            start_srt = self.format_timestamp(start)
            end_srt = self.format_timestamp(end)
            text = chunk['text'].replace("\n", " ")  # Replace newlines in text with spaces
            current_window['id'] = i
            current_window['start_time'] = start_srt
            current_window['end_time'] = end_srt
            current_window['text'] = text
            blocks.append(current_window)
        return blocks

    def transcript_to_srt(self, transcript: dict) -> str:
        """
        Convert a transcript to SRT format.
        """
        srt = ""
        for i, chunk in enumerate(transcript['chunks'], start=1):
            start, end = chunk['timestamp']
            text = chunk['text'].replace("\n", " ")  # Replace newlines in text with spaces
            # Convert start and end times to SRT format (HH:MM:SS,MMM)
            start_srt = f"{int(start // 3600):02}:{int(start % 3600 // 60):02}:{int(start % 60):02},{int(start * 1000 % 1000):03}"
            end_srt = f"{int(end // 3600):02}:{int(end % 3600 // 60):02}:{int(end % 60):02},{int(end * 1000 % 1000):03}"
            srt += f"{i}\n{start_srt} --> {end_srt}\n{text}\n\n"
        return srt

    def chunk_text(self, text, chunk_size, tokenizer):
        # Tokenize the text and get the number of tokens
        tokens = tokenizer.tokenize(text)
        # Split the tokens into chunks
        for i in range(0, len(tokens), chunk_size):
            yield tokenizer.convert_tokens_to_string(
                tokens[i:i + chunk_size]
            )

    def get_summary(self, documents: list) -> str:
        """
        Get a summary of a text.
        """
        try:
            splitter = TokenTextSplitter(
                chunk_size=5000,
                chunk_overlap=100,
            )
            summarize_chain = load_summarize_chain(
                llm=self._llm,
                chain_type="refine"
            )
            chunks = splitter.split_documents(documents)
            summary = summarize_chain.invoke(chunks)
            return summary
        except Exception as e:
            print('ERROR in get_summary:', e)
            return ""

    def summarization(self, text: str) -> str:
        """
        Get a summary of a text considering token limits.
        """
        try:
            tokenizer = self.summarizer.tokenizer
            # to be safe under the limit
            max_length = tokenizer.model_max_length - 10
            summaries = []
            for text_chunk in self.chunk_text(text, max_length, tokenizer):
                chunk_summary = self.summarizer(
                    text_chunk,
                    max_length=150,
                    min_length=30,
                    do_sample=False)[0]['summary_text']
                summaries.append(chunk_summary)
            return " ".join(summaries)
        except Exception as e:
            print('ERROR in summarization:', e)
            return ""

    def extract_audio(self, video_path, audio_path):
        """
        Extracts the audio from a video file and saves it as an audio file.

        Args:
            video_path (str): Path to the video file.
            audio_path (str): Path where the extracted audio file will be saved.
        """
        if audio_path.exists():
            print(f"Audio already extracted: {audio_path}")
            return
        video_clip = VideoFileClip(str(video_path))
        audio_clip = video_clip.audio
        if not audio_clip:
            video_clip.close()
            return
        audio_clip.write_audiofile(str(audio_path))
        audio_clip.close()
        video_clip.close()

    def get_whisper_transcript(self, audio_path: Path, chunk_length: int = 30):
        # Select the Whisper checkpoint based on model name, size and language
        if self._model_name == 'whisper':
            if self._language == 'en':
                model_name = f"openai/whisper-{self._model_size}.en"
            elif self._language == 'es':
                model_name = f"juancopi81/whisper-{self._model_size}-es"
            else:
                model_name = "openai/whisper-large-v3"
        else:
            model_name = self._model_name

        # Load the model and processor
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        processor = WhisperProcessor.from_pretrained(model_name)

        # Try to load the generation config, fall back to the default if it doesn't exist
        try:
            generation_config = GenerationConfig.from_pretrained(model_name)
        except EnvironmentError:
            print(
                f"Warning: No generation_config.json found for model {model_name}. Using default configuration."
            )
            generation_config = model.generation_config

        # Check and set the no_timestamps_token_id if it doesn't exist
        if not hasattr(model.config, 'no_timestamps_token_id'):
            model.config.no_timestamps_token_id = processor.tokenizer.convert_tokens_to_ids('<|notimestamps|>')

        # Define the generation configuration with WhisperTimeStampLogitsProcessor
        try:
            model.config.logits_processor = [
                WhisperTimeStampLogitsProcessor(generation_config)
            ]
        except Exception:
            model.config.logits_processor = [
                WhisperTimeStampLogitsProcessor(model.config)
            ]

        whisper_pipe = pipeline(
            task=self._task,
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=self._get_device(),
            chunk_length_s=chunk_length,
            use_fast=True
        )
        if audio_path.exists() and audio_path.stat().st_size > 0:
            # Use the pipeline to extract a timestamped transcript
            return whisper_pipe(
                str(audio_path),
                return_timestamps=True
            )
        return None

    @abstractmethod
    def load(self) -> list:
        pass

    @abstractmethod
    def load_video(self, url: str, video_title: str, transcript: str) -> list:
        pass

    def parse(self, source):
        pass
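BaseVideoLoader leaves load() and load_video() abstract, so a concrete loader has to wire the helpers above (extract_audio, get_whisper_transcript, transcript_to_blocks) together itself. The sketch below shows one way a subclass might do that for local files; the LocalVideoLoader name, the use of langchain_core.documents.Document, and the metadata keys are illustrative assumptions, not part of the published package.

# Hypothetical subclass sketch (not part of the ai-parrot wheel): combines the
# BaseVideoLoader helpers into a concrete loader for local video files.
from pathlib import Path
from langchain_core.documents import Document  # assumed document type

from parrot.loaders.basevideo import BaseVideoLoader


class LocalVideoLoader(BaseVideoLoader):
    """Transcribe local video files into Langchain documents."""

    def load(self) -> list:
        documents = []
        for url in self.urls:
            video_path = Path(url)
            audio_path = video_path.with_suffix(".mp3")
            # Reuse the base-class helpers: extract audio, then run Whisper.
            self.extract_audio(video_path, audio_path)
            transcript = self.get_whisper_transcript(audio_path)
            if transcript:
                documents.extend(
                    self.load_video(url, video_path.stem, transcript)
                )
        return documents

    def load_video(self, url: str, video_title: str, transcript: str) -> list:
        # One document per timestamped block, carrying the block metadata.
        docs = []
        for block in self.transcript_to_blocks(transcript):
            docs.append(
                Document(
                    page_content=block["text"],
                    metadata={
                        "source": url,
                        "title": video_title,
                        "start_time": block["start_time"],
                        "end_time": block["end_time"],
                    },
                )
            )
        return docs


# loader = LocalVideoLoader(urls=["/videos/demo.mp4"], source_type="video")
# docs = loader.load()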
parrot/loaders/csv.py
ADDED
@@ -0,0 +1,42 @@
from pathlib import PurePath
from langchain_community.document_loaders.csv_loader import CSVLoader as CSVL
from .abstract import AbstractLoader


class CSVLoader(AbstractLoader):
    """
    Loader for CSV files.
    """
    _extension = ['.csv']
    csv_args: dict = {
        "delimiter": ",",
        "quotechar": '"',
        "escapechar": "\\",
        "skipinitialspace": False,
        "lineterminator": "\n",
        "quoting": 0,
        "skiprows": 0,
        "encoding": None
    }

    def load(self, path: PurePath) -> list:
        """
        Load data from a CSV file.

        Args:
            path (PurePath): The path to the CSV file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading CSV file: {path}")
            loader = CSVL(
                file_path=path,
                csv_args=self.csv_args,
                autodetect_encoding=True
            )
            documents = loader.load()
            return self.split_documents(documents)
        else:
            return []
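CSVLoader inherits its constructor from AbstractLoader; judging from the BaseVideoLoader constructor shown above, tokenizer and text_splitter appear to default to None and source_type can be passed as a keyword, but that is an assumption about AbstractLoader rather than something this diff confirms. Under that assumption, a call site might look like the sketch below; the file path and source_type value are placeholders.

# Hypothetical usage sketch; AbstractLoader constructor defaults are assumed,
# not confirmed by this diff.
from pathlib import PurePath
from parrot.loaders.csv import CSVLoader

loader = CSVLoader(source_type="csv")
documents = loader.load(PurePath("data/example.csv"))  # split Langchain documents
for doc in documents[:3]:
    print(doc.metadata, doc.page_content[:80])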
parrot/loaders/dir.py
ADDED
@@ -0,0 +1,37 @@
import os
import glob
from pathlib import PurePath
from langchain_community.document_loaders import (
    DirectoryLoader
)


def load_directory(
    path: PurePath,
    text_splitter,
    source_type,
    file_pattern: str = "**/*.txt"
):
    """
    Load all text documents from a directory.

    Args:
        path (PurePath): The path to the directory.
        text_splitter (TextSplitter): A text splitter object.
        source_type (str): The type of source.
        file_pattern (str): Glob pattern for the files to load.

    Returns:
        list: A list of documents.
    """
    loader = DirectoryLoader(
        path=str(path),
        glob=file_pattern,
        recursive=True,
        show_progress=True,
        use_multithreading=True
    )
    documents = loader.load()
    for doc in documents:
        doc.metadata['source_type'] = source_type
    return text_splitter.split_documents(documents)
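load_directory expects a ready-made text splitter. A minimal calling sketch follows; RecursiveCharacterTextSplitter is just one reasonable choice and is not mandated by the package, and the directory path, pattern, and source_type value are placeholders.

# Hypothetical usage sketch; splitter choice, paths, and source_type are illustrative only.
from pathlib import PurePath
from langchain.text_splitter import RecursiveCharacterTextSplitter
from parrot.loaders.dir import load_directory

splitter = RecursiveCharacterTextSplitter(chunk_size=768, chunk_overlap=50)
chunks = load_directory(
    PurePath("./docs"),
    text_splitter=splitter,
    source_type="documentation",
    file_pattern="**/*.md",
)
print(f"Loaded {len(chunks)} chunks")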