ai-parrot 0.3.4__cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic. Click here for more details.
- ai_parrot-0.3.4.dist-info/LICENSE +21 -0
- ai_parrot-0.3.4.dist-info/METADATA +319 -0
- ai_parrot-0.3.4.dist-info/RECORD +109 -0
- ai_parrot-0.3.4.dist-info/WHEEL +6 -0
- ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
- parrot/__init__.py +21 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +728 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +366 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +83 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/odoo.py +17 -0
- parrot/chatbots/retrievals/__init__.py +578 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +110 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +162 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +137 -0
- parrot/llms/abstract.py +47 -0
- parrot/llms/anthropic.py +42 -0
- parrot/llms/google.py +42 -0
- parrot/llms/groq.py +45 -0
- parrot/llms/hf.py +45 -0
- parrot/llms/openai.py +59 -0
- parrot/llms/pipes.py +114 -0
- parrot/llms/vertex.py +78 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/audio.py +106 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +437 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +120 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +166 -0
- parrot/models.py +372 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +48 -0
- parrot/stores/abstract.py +171 -0
- parrot/stores/milvus.py +632 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +12 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/audio.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from pathlib import PurePath
|
|
4
|
+
from langchain.docstore.document import Document
|
|
5
|
+
from .basevideo import BaseVideoLoader
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AudioLoader(BaseVideoLoader):
    """
    Generating transcripts from local Audio.

    Loads a single audio file, or every file in a directory whose name ends
    with one of ``_extension``, transcribes it through
    ``get_whisper_transcript`` and returns LangChain ``Document`` objects for
    the full transcript, its VTT rendering and every timestamped chunk.
    """
    _extension = ['.mp3', '.webm']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'documentation',
        encoding: str = 'utf-8',
        origin: str = '',
        **kwargs
    ):
        """
        Args:
            path: Audio file or directory of audio files to load.
            tokenizer: Forwarded to the base loader.
            text_splitter: Forwarded to the base loader.
            source_type: Value stored under ``source_type`` in the metadata.
            encoding: Accepted for signature compatibility; not used by the
                code in this class.
            origin: Accepted for signature compatibility; not used by the
                code in this class.
            **kwargs: Forwarded to ``BaseVideoLoader.__init__``.
        """
        # BaseVideoLoader.__init__ takes ``urls`` as its first positional
        # parameter, so the remaining arguments must be passed by keyword;
        # passing them positionally would bind the tokenizer to ``urls``
        # and the text splitter to ``tokenizer``.
        super().__init__(
            urls=kwargs.pop('urls', []),
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            **kwargs
        )
        if isinstance(path, str):
            # load() relies on is_file()/is_dir()/glob(), which plain
            # strings do not provide.
            from pathlib import Path
            path = Path(path).resolve()
        self.path = path

    def load_audio(self, path: PurePath) -> list:
        """
        Transcribe one audio file and build its Documents.

        Args:
            path: Path of the audio file; the VTT transcript is written next
                to it with a ``.vtt`` suffix.

        Returns:
            list: The full-transcript Document (only when the transcript text
            is non-empty), followed by the VTT Document and one Document per
            timestamped chunk. Empty when no transcript was produced.
        """
        metadata = {
            "source": f"{path}",
            "url": f"{path.name}",
            "index": path.stem,
            "filename": f"{path}",
            "question": '',
            "answer": '',
            'type': 'audio_transcript',
            "source_type": self._source_type,
            "summary": '',
            "document_meta": {
                "language": self._language,
                "topic_tags": ""
            }
        }
        documents = []
        transcript_path = path.with_suffix('.vtt')
        # get the Whisper parser
        transcript_whisper = self.get_whisper_transcript(path)
        transcript = transcript_whisper['text'] if transcript_whisper else ''
        if transcript:
            # Summarize the transcript.
            # NOTE(review): get_summary_from_text is not defined in the code
            # visible here; presumably inherited from a base class.
            metadata['summary'] = self.get_summary_from_text(transcript)
            # First document: the whole transcript as a single page.
            documents.append(
                Document(
                    page_content=transcript,
                    metadata=metadata
                )
            )
        if transcript_whisper:
            # VTT version (also saved to ``transcript_path``):
            vtt = self.transcript_to_vtt(transcript_whisper, transcript_path)
            documents.append(
                Document(
                    page_content=vtt,
                    metadata=metadata
                )
            )
            # Saving every dialog chunk as a separate document
            for chunk in self.transcript_to_blocks(transcript_whisper):
                _meta = {
                    "index": f"{path.stem}:{chunk['id']}",
                    "document_meta": {
                        "start": f"{chunk['start_time']}",
                        "end": f"{chunk['end_time']}",
                        "id": f"{chunk['id']}",
                        "language": self._language,
                        "title": f"{path.stem}",
                        "topic_tags": ""
                    }
                }
                # Chunk-specific keys override the file-level metadata.
                documents.append(
                    Document(
                        page_content=chunk['text'],
                        metadata={**metadata, **_meta}
                    )
                )
        return documents

    def load(self) -> list:
        """
        Load the configured file or directory.

        Returns:
            list: The result of ``split_documents`` applied to every Document
            produced by ``load_audio``.
        """
        documents = []
        if self.path.is_file():
            documents.extend(self.load_audio(self.path))
        if self.path.is_dir():
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    if set(item.parts).isdisjoint(self.skip_directories):
                        documents.extend(
                            self.load_audio(item)
                        )
        return self.split_documents(documents)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import Any
|
|
3
|
+
from abc import abstractmethod
|
|
4
|
+
from pathlib import Path, PurePath
|
|
5
|
+
from PIL import Image
|
|
6
|
+
from .abstract import AbstractLoader
|
|
7
|
+
from ..conf import STATIC_DIR
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BasePDF(AbstractLoader):
    """
    Base Abstract loader for all PDF files.

    Subclasses implement ``_load_pdf``; this class resolves the input path
    (a file, a directory or a list of files) and drives the loading.
    """
    _extension = ['.pdf']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        """
        Args:
            path: PDF file, directory of PDFs, or list of PDF paths. A string
                is resolved to an absolute ``Path``.
            tokenizer: Forwarded to the base loader.
            text_splitter: Forwarded to the base loader.
            source_type: Forwarded to the base loader.
            language: Language code; ``'en'`` is normalised to ``'eng'``.
            **kwargs: Forwarded to the base loader. ``save_images`` (bool) is
                also read here and, when true, the image directory is created.
        """
        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
        self.path = path
        if isinstance(path, str):
            self.path = Path(path).resolve()
        self.save_images: bool = bool(kwargs.get('save_images', False))
        self._imgdir = STATIC_DIR.joinpath('images')
        if self.save_images is True:
            if self._imgdir.exists() is False:
                self._imgdir.mkdir(parents=True, exist_ok=True)
        if language == 'en':
            language = 'eng'
        self._lang = language

    def save_image(self, img_stream: Image.Image, image_name: str, save_path: Path):
        """
        Save an image as PNG under ``save_path`` unless it already exists.

        Args:
            img_stream: Image object exposing ``save``.
            image_name: File name of the image inside ``save_path``.
            save_path: Target directory; created when missing.

        Returns:
            Path: Location of the image file.
        """
        # Create the image directory if it does not exist
        if save_path.exists() is False:
            save_path.mkdir(parents=True, exist_ok=True)
        img_path = save_path.joinpath(image_name)
        self.logger.notice(
            f"Saving Image Page on {img_path}"
        )
        if not img_path.exists():
            # Save the image
            img_stream.save(img_path, format="PNG", optimize=True)
        return img_path

    @abstractmethod
    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using Fitz.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        pass

    def load(self) -> list:
        """
        Load data from the configured PDF path(s).

        ``self.path`` may be a list of files, a directory (every file
        matching ``_extension`` is loaded) or a single file.

        Returns:
            list: A list of Langchain Documents, after ``split_documents``.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the path is neither a file nor a directory.
        """
        if isinstance(self.path, list):
            # list of files:
            documents = []
            for p in self.path:
                if self._check_path(p):
                    documents.extend(self._load_pdf(p))
            # Return here: the checks below call Path methods on self.path
            # and would raise AttributeError on a list.
            return self.split_documents(documents)
        if not self.path.exists():
            raise FileNotFoundError(
                f"PDF file/directory not found: {self.path}"
            )
        if self.path.is_dir():
            documents = []
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    if set(item.parts).isdisjoint(self.skip_directories):
                        documents.extend(self._load_pdf(item))
        elif self.path.is_file():
            documents = self._load_pdf(self.path)
        else:
            raise ValueError(
                f"PDF Loader: Invalid path: {self.path}"
            )
        return self.split_documents(documents)

    def parse(self, source):
        """Not supported by PDF loaders; always raises NotImplementedError."""
        raise NotImplementedError(
            "Parser method is not implemented for PDFLoader."
        )
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import Any, Union, List
|
|
3
|
+
from abc import abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from moviepy.editor import VideoFileClip
|
|
6
|
+
from transformers import (
|
|
7
|
+
pipeline,
|
|
8
|
+
AutoModelForSeq2SeqLM,
|
|
9
|
+
AutoTokenizer,
|
|
10
|
+
GenerationConfig
|
|
11
|
+
)
|
|
12
|
+
from transformers import (
|
|
13
|
+
WhisperForConditionalGeneration,
|
|
14
|
+
WhisperProcessor,
|
|
15
|
+
WhisperTimeStampLogitsProcessor
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
from langchain.chains.summarize import load_summarize_chain
|
|
19
|
+
from langchain.text_splitter import (
|
|
20
|
+
TokenTextSplitter,
|
|
21
|
+
)
|
|
22
|
+
from .abstract import AbstractLoader
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def extract_video_id(url):
    """
    Return the value of the ``v`` query parameter of a video URL.

    The query string is parsed, so ``v`` is found wherever it appears
    (``watch?feature=share&v=ID`` works as well as ``watch?v=ID&t=10``).

    Args:
        url (str): URL carrying the video id in its ``v`` parameter.

    Returns:
        str: The video id.

    Raises:
        IndexError: If the URL has no ``v`` parameter.
    """
    from urllib.parse import parse_qs, urlparse

    ids = parse_qs(urlparse(url).query).get("v")
    if ids:
        return ids[0]
    # Legacy behaviour, kept so inputs without a usable ``v`` parameter
    # behave exactly as before (IndexError when "?v=" is absent).
    parts = url.split("?v=")
    video_id = parts[1].split("&")[0]
    return video_id
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class BaseVideoLoader(AbstractLoader):
    """
    Generating Video transcripts from Videos.

    Base class providing Whisper transcription, VTT/SRT rendering, audio
    extraction and summarization helpers. Subclasses implement ``load`` and
    ``load_video``.
    """
    _extension = ['.youtube']
    encoding = 'utf-8'
    chunk_size = 768

    def __init__(
        self,
        urls: List[str],
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'video',
        language: str = "en",
        video_path: Union[str, Path] = None,
        **kwargs
    ):
        """
        Args:
            urls: Video URLs to process.
            tokenizer: Forwarded to the base loader.
            text_splitter: Forwarded to the base loader.
            source_type: Forwarded to the base loader.
            language: Language code used to pick the Whisper model.
            video_path: Directory for video files; a string is resolved to an
                absolute ``Path``.
            **kwargs: Forwarded to the base loader. Also read here:
                ``task``, ``topics``, ``model_size`` and ``model_name``.
        """
        super().__init__(tokenizer, text_splitter, source_type, **kwargs)
        self.urls = urls
        self._task = kwargs.get('task', "automatic-speech-recognition")
        # Topics:
        self.topics: list = kwargs.get('topics', [])
        self._model_size: str = kwargs.get('model_size', 'medium')
        self.summarization_model = "facebook/bart-large-cnn"
        self._model_name: str = kwargs.get('model_name', 'whisper')
        self.summarizer = pipeline(
            "summarization",
            tokenizer=AutoTokenizer.from_pretrained(
                self.summarization_model
            ),
            model=AutoModelForSeq2SeqLM.from_pretrained(
                self.summarization_model
            ),
            device=self._get_device()
        )
        # language:
        self._language = language
        # directory:
        if isinstance(video_path, str):
            self._video_path = Path(video_path).resolve()
        else:
            # Keep Path/None as given; without this ``else`` the resolved
            # path above was overwritten by the raw string.
            self._video_path = video_path

    def transcript_to_vtt(self, transcript: dict, transcript_path: Path) -> str:
        """
        Convert a transcript to VTT format and save it to ``transcript_path``.

        Args:
            transcript: Mapping with a ``'chunks'`` list; every chunk has a
                ``'timestamp'`` (start, end) pair in seconds and a ``'text'``.
            transcript_path: File the VTT text is written to. A write failure
                is reported on stdout and does not raise.

        Returns:
            str: The VTT document. Chunks with a missing start or end
            timestamp are skipped.
        """
        cues = ["WEBVTT\n\n"]
        for i, chunk in enumerate(transcript['chunks'], start=1):
            start, end = chunk['timestamp']
            text = chunk['text'].replace("\n", " ")  # Replace newlines in text with spaces

            if start is None or end is None:
                print(f"Warning: Missing timestamp for chunk {i}, skipping this chunk.")
                continue

            # Convert timestamps to WebVTT format (HH:MM:SS.MMM); same layout
            # as SRT but with a dot before the milliseconds.
            start_vtt = self.format_timestamp(start).replace(",", ".")
            end_vtt = self.format_timestamp(end).replace(",", ".")

            cues.append(f"{i}\n{start_vtt} --> {end_vtt}\n{text}\n\n")
        vtt = "".join(cues)
        # Save the VTT file
        try:
            # Explicit encoding so non-ASCII transcripts do not depend on the
            # platform default.
            with open(str(transcript_path), "w", encoding=self.encoding) as f:
                f.write(vtt)
            print(f'Saved VTT File on {transcript_path}')
        except Exception as exc:
            print(f"Error saving VTT file: {exc}")
        return vtt

    def format_timestamp(self, seconds):
        """
        Format a number of seconds as ``HH:MM:SS,mmm``.

        Args:
            seconds: Time offset in seconds (int or float).

        Returns:
            str: The formatted timestamp, milliseconds included.
        """
        # Work in whole milliseconds so the fractional part survives; the
        # value must not be truncated to int before it is extracted.
        total_ms = int(round(float(seconds) * 1000))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, milliseconds = divmod(remainder, 1000)
        return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

    def transcript_to_blocks(self, transcript: dict) -> list:
        """
        Convert a transcript to blocks.

        Args:
            transcript: Mapping with a ``'chunks'`` list (see
                ``transcript_to_vtt``).

        Returns:
            list: One dict per chunk with the keys ``id`` (1-based position),
            ``start_time``, ``end_time`` and ``text``. Chunks with a missing
            timestamp are skipped.
        """
        blocks = []
        for i, chunk in enumerate(transcript['chunks'], start=1):
            start, end = chunk['timestamp']
            if start is None or end is None:
                print(f"Warning: Missing timestamp for chunk {i}, skipping this chunk.")
                continue

            blocks.append({
                'id': i,
                'start_time': self.format_timestamp(start),
                'end_time': self.format_timestamp(end),
                # Replace newlines in text with spaces
                'text': chunk['text'].replace("\n", " "),
            })
        return blocks

    def transcript_to_srt(self, transcript: dict) -> str:
        """
        Convert a transcript to SRT format.

        Args:
            transcript: Mapping with a ``'chunks'`` list (see
                ``transcript_to_vtt``).

        Returns:
            str: The SRT document. Chunks with a missing timestamp are
            skipped, as in the VTT and block conversions.
        """
        entries = []
        for i, chunk in enumerate(transcript['chunks'], start=1):
            start, end = chunk['timestamp']
            if start is None or end is None:
                print(f"Warning: Missing timestamp for chunk {i}, skipping this chunk.")
                continue
            text = chunk['text'].replace("\n", " ")  # Replace newlines in text with spaces
            # SRT timestamps are HH:MM:SS,MS; format_timestamp handles float
            # seconds, which plain ``:02`` formatting does not.
            start_srt = self.format_timestamp(start)
            end_srt = self.format_timestamp(end)
            entries.append(f"{i}\n{start_srt} --> {end_srt}\n{text}\n\n")
        return "".join(entries)

    def chunk_text(self, text, chunk_size, tokenizer):
        """
        Yield ``text`` in pieces of at most ``chunk_size`` tokens.

        Args:
            text: Text to split.
            chunk_size: Maximum number of tokens per piece.
            tokenizer: Object exposing ``tokenize`` and
                ``convert_tokens_to_string``.

        Yields:
            str: Each piece, converted back from tokens to a string.
        """
        # Tokenize the text and get the number of tokens
        tokens = tokenizer.tokenize(text)
        # Split the tokens into chunks
        for i in range(0, len(tokens), chunk_size):
            yield tokenizer.convert_tokens_to_string(
                tokens[i:i+chunk_size]
            )

    def get_summary(self, documents: list) -> str:
        """
        Summarize documents with a "refine" summarize chain over ``self._llm``.

        Args:
            documents: Documents to split and summarize.

        Returns:
            The result of invoking the chain, or ``""`` when any step fails
            (the error is printed).
        """
        try:
            splitter = TokenTextSplitter(
                chunk_size=5000,
                chunk_overlap=100,
            )
            summarize_chain = load_summarize_chain(
                llm=self._llm,
                chain_type="refine"
            )
            chunks = splitter.split_documents(documents)
            summary = summarize_chain.invoke(chunks)
            return summary
        except Exception as e:
            print('ERROR in get_summary:', e)
            return ""

    def summarization(self, text: str) -> str:
        """
        Get a summary of a text considering token limits.

        The text is split into pieces below the summarizer tokenizer's
        ``model_max_length``; each piece is summarized and the partial
        summaries are joined with spaces.

        Returns:
            str: The combined summary, or ``""`` when any step fails (the
            error is printed).
        """
        try:
            tokenizer = self.summarizer.tokenizer
            # to be safe under the limit
            max_length = tokenizer.model_max_length - 10
            summaries = []
            for text_chunk in self.chunk_text(text, max_length, tokenizer):
                chunk_summary = self.summarizer(
                    text_chunk,
                    max_length=150,
                    min_length=30,
                    do_sample=False)[0]['summary_text']
                summaries.append(chunk_summary)
            return " ".join(summaries)
        except Exception as e:
            print('ERROR in summarization:', e)
            return ""

    def extract_audio(self, video_path, audio_path):
        """
        Extracts the audio from a video file and saves it as an audio file.

        Nothing is written when ``audio_path`` already exists or when the
        video has no audio track.

        Args:
            video_path (str | Path): Path to the video file.
            audio_path (Path): Path where the extracted audio file will be
                saved; must expose ``exists()``.
        """
        if audio_path.exists():
            print(f"Audio already extracted: {audio_path}")
            return
        video_clip = VideoFileClip(str(video_path))
        try:
            audio_clip = video_clip.audio
            if not audio_clip:
                return
            try:
                audio_clip.write_audiofile(str(audio_path))
            finally:
                audio_clip.close()
        finally:
            # Always release the video reader, including on the early
            # return above and when writing the audio fails.
            video_clip.close()

    def get_whisper_transcript(self, audio_path: Path, chunk_length: int = 30):
        """
        Transcribe an audio file with a Whisper speech-recognition pipeline.

        Args:
            audio_path: Audio file to transcribe.
            chunk_length: Value passed as ``chunk_length_s`` to the pipeline.

        Returns:
            The pipeline result (called with ``return_timestamps=True``), or
            ``None`` when the file is missing or empty.
        """
        # Check the input first so no model is loaded for a file that cannot
        # be transcribed anyway.
        if not (audio_path.exists() and audio_path.stat().st_size > 0):
            return None

        # Initialize the Whisper parser
        if self._model_name == 'whisper':
            if self._language == 'en':
                model_name = f"openai/whisper-{self._model_size}.en"
            elif self._language == 'es':
                model_name = f"juancopi81/whisper-{self._model_size}-es"
            else:
                model_name = "openai/whisper-large-v3"
        else:
            model_name = self._model_name

        # Load the model and processor
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        processor = WhisperProcessor.from_pretrained(model_name)

        # Try to load the generation config, fallback to default if it doesn't exist
        try:
            generation_config = GenerationConfig.from_pretrained(model_name)
        except EnvironmentError:
            print(
                f"Warning: No generation_config.json found for model {model_name}. Using default configuration."
            )
            generation_config = model.generation_config

        # Check and set the no_timestamps_token_id if it doesn't exist
        if not hasattr(model.config, 'no_timestamps_token_id'):
            model.config.no_timestamps_token_id = processor.tokenizer.convert_tokens_to_ids('<|notimestamps|>')

        # Define the generation configuration with WhisperTimeStampLogitsProcessor
        try:
            model.config.logits_processor = [
                WhisperTimeStampLogitsProcessor(generation_config)
            ]
        except Exception:
            model.config.logits_processor = [
                WhisperTimeStampLogitsProcessor(model.config)
            ]

        whisper_pipe = pipeline(
            task=self._task,
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=self._get_device(),
            chunk_length_s=chunk_length,
            use_fast=True
        )
        # Use the parser to extract transcript
        return whisper_pipe(
            str(audio_path),
            return_timestamps=True
        )

    @abstractmethod
    def load(self) -> list:
        """Load the source(s) and return the resulting documents."""
        pass

    @abstractmethod
    def load_video(self, url: str, video_title: str, transcript: str) -> list:
        """Build the documents for a single video."""
        pass

    def parse(self, source):
        """No-op; returns ``None``."""
        pass
|
parrot/loaders/csv.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from pathlib import PurePath
|
|
2
|
+
from langchain_community.document_loaders.csv_loader import CSVLoader as CSVL
|
|
3
|
+
from .abstract import AbstractLoader
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CSVLoader(AbstractLoader):
    """
    Loader for CSV files.
    """
    _extension = ['.csv']
    csv_args: dict = {
        "delimiter": ",",
        "quotechar": '"',
        "escapechar": "\\",
        "skipinitialspace": False,
        "lineterminator": "\n",
        "quoting": 0,
        "skiprows": 0,
        "encoding": None
    }
    # Formatting parameters understood by the standard ``csv`` module.
    # ``csv_args`` is forwarded to the csv reader, which raises TypeError on
    # any other keyword (such as "skiprows" or "encoding").
    _csv_fmtparams = frozenset({
        "delimiter",
        "doublequote",
        "escapechar",
        "lineterminator",
        "quotechar",
        "quoting",
        "skipinitialspace",
        "strict",
    })

    def load(self, path: PurePath) -> list:
        """
        Load data from a CSV file.

        Args:
            path (PurePath): The path to the CSV file.

        Returns:
            list: A list of Langchain Documents (after ``split_documents``),
            or an empty list when ``_check_path`` rejects the path.
        """
        if not self._check_path(path):
            return []
        self.logger.info(f"Loading CSV file: {path}")
        # Only dialect parameters may reach the csv reader; the file
        # encoding goes through the loader's own ``encoding`` argument.
        fmtparams = {
            key: value
            for key, value in self.csv_args.items()
            if key in self._csv_fmtparams
        }
        loader = CSVL(
            file_path=path,
            csv_args=fmtparams,
            encoding=self.csv_args.get("encoding"),
            autodetect_encoding=True
        )
        documents = loader.load()
        return self.split_documents(documents)
|
parrot/loaders/dir.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
from pathlib import PurePath
|
|
4
|
+
from langchain_community.document_loaders import (
|
|
5
|
+
DirectoryLoader
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def load_directory(
    path: PurePath,
    text_splitter,
    source_type,
    file_pattern: str = "**/*.txt"
):
    """
    Load, tag and split every document of a directory matching a pattern.

    Args:
        path (PurePath): Directory to scan; converted to ``str`` for the
            loader.
        text_splitter (TextSplitter): Object whose ``split_documents`` is
            applied to the loaded documents.
        source_type (str): Value stored in each document's
            ``metadata['source_type']``.
        file_pattern (str): Glob pattern of the files to load.

    Returns:
        list: The output of ``text_splitter.split_documents``.
    """
    directory_loader = DirectoryLoader(
        path=str(path),
        glob=file_pattern,
        recursive=True,
        show_progress=True,
        use_multithreading=True
    )
    loaded = directory_loader.load()
    # Tag every document with its origin before splitting.
    for entry in loaded:
        entry.metadata['source_type'] = source_type
    return text_splitter.split_documents(loaded)
|