ai-parrot 0.1.0__cp311-cp311-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic. Click here for more details.
- ai_parrot-0.1.0.dist-info/LICENSE +21 -0
- ai_parrot-0.1.0.dist-info/METADATA +299 -0
- ai_parrot-0.1.0.dist-info/RECORD +108 -0
- ai_parrot-0.1.0.dist-info/WHEEL +5 -0
- ai_parrot-0.1.0.dist-info/top_level.txt +3 -0
- parrot/__init__.py +18 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +965 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +257 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +100 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/oddie.py +17 -0
- parrot/chatbots/retrievals/__init__.py +515 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +108 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +169 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +0 -0
- parrot/llms/abstract.py +41 -0
- parrot/llms/anthropic.py +36 -0
- parrot/llms/google.py +37 -0
- parrot/llms/groq.py +33 -0
- parrot/llms/hf.py +39 -0
- parrot/llms/openai.py +49 -0
- parrot/llms/pipes.py +103 -0
- parrot/llms/vertex.py +68 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +187 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +107 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +152 -0
- parrot/models.py +347 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +0 -0
- parrot/stores/abstract.py +170 -0
- parrot/stores/milvus.py +540 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +16 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/rtd.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from pathlib import PurePath
|
|
3
|
+
from typing import Any
|
|
4
|
+
import re
|
|
5
|
+
from langchain_community.document_loaders import ReadTheDocsLoader as RTLoader
|
|
6
|
+
from .abstract import AbstractLoader
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ReadTheDocsLoader(AbstractLoader):
    """Load documentation pages produced by a ReadTheDocs build.

    Wraps langchain's ``ReadTheDocsLoader`` and rewrites each document's
    metadata so that ``source``/``url`` point at the public site
    (``origin``) instead of the local HTML dump under ``path``.
    """
    _extension: list = []

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'documentation',
        encoding: str = 'utf-8',
        origin: str = '',
        **kwargs
    ):
        """
        Args:
            path: Directory containing the downloaded ReadTheDocs HTML files.
            tokenizer: Optional tokenizer forwarded to AbstractLoader.
            text_splitter: Optional splitter forwarded to AbstractLoader.
            source_type: Tag stored in each document's metadata.
            encoding: Encoding used to read the HTML files.
            origin: Public base URL of the documentation site; its hostname
                is stripped from on-disk paths when rebuilding URLs.
        """
        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
        self.path = path
        self.encoding = encoding
        self.origin = origin
        # Hostname part of `origin` (e.g. "docs.example.com"); removed from
        # the on-disk path when reconstructing the public URL.
        self._prefix = ''
        match = re.search(r'://([^/]+)', origin)
        if match:
            self._prefix = match.group(1)

    def load(self) -> list:
        """Load every page and rewrite its metadata.

        Returns:
            list: Langchain Documents with normalized metadata.
        """
        self.logger.info(
            f"Loading ReadTheDocs from: {self.path}"
        )
        rt_loader = RTLoader(path=self.path, encoding=self.encoding)
        documents = rt_loader.load()
        for doc in documents:
            # Guard against documents that carry no 'source' entry: the
            # original code would crash calling .replace on None.
            src = doc.metadata.get('source') or ''
            # Strip the local directory prefix and the site hostname to
            # recover the path component of the public URL.
            suffix = src.replace(f'{self.path}', '').replace(self._prefix, '')
            if suffix.startswith('//'):
                # collapse the doubled separator left by the replacements
                suffix = suffix[1:]
            url = f"{self.origin}{suffix}"
            metadata = {
                "source": url,
                "url": url,
                "index": suffix,
                "filename": src,
                "question": '',
                "answer": '',
                'type': 'documentation',
                "source_type": self._source_type,
                "summary": '',
                "document_meta": {
                    **doc.metadata
                }
            }
            doc.metadata = metadata
        return documents

    def parse(self, source):
        # Parsing is delegated entirely to the underlying RTLoader.
        pass
|
parrot/loaders/txt.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from pathlib import Path, PurePath
|
|
4
|
+
from langchain_community.document_loaders import TextLoader
|
|
5
|
+
from langchain.docstore.document import Document
|
|
6
|
+
from .abstract import AbstractLoader
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TXTLoader(AbstractLoader):
    """
    Loader for plain-text (.txt) files.

    Accepts either a single file or a directory; for a directory, every
    file matching ``_extension`` is loaded.
    """
    _extension = ['.txt']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'text',
        **kwargs
    ):
        """
        Args:
            path (PurePath | str): File or directory to load from; plain
                strings are resolved to absolute paths.
            tokenizer (Callable): Optional tokenizer forwarded to AbstractLoader.
            text_splitter (Callable): Optional splitter forwarded to AbstractLoader.
            source_type (str): Tag stored in each document's metadata.
        """
        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
        self.path = path
        if isinstance(path, str):
            self.path = Path(path).resolve()

    def _load_document(self, path: Path) -> list:
        """
        Load a single TXT file.

        Args:
            path (Path): The path to the TXT file.

        Returns:
            list: A list of Langchain Documents (empty if the path check fails).
        """
        if self._check_path(path):
            self.logger.info(f"Loading TXT file: {path}")
            # Read as UTF-8 explicitly so results don't depend on the
            # platform's default locale encoding.
            with open(path, 'r', encoding='utf-8') as file:
                text = file.read()
            return [
                Document(
                    page_content=text,
                    metadata={
                        "url": '',
                        "index": str(path.name),
                        "source": str(path),
                        "filename": str(path.name),
                        "summary": '',
                        "question": '',
                        "answer": '',
                        'type': 'text',
                        "source_type": self._source_type,
                        "document_meta": {}
                    }
                )
            ]
        return []

    def load(self) -> list:
        """
        Load data from a TXT file or a directory of TXT files.

        Returns:
            list: Langchain Documents split via the configured splitter.

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
            ValueError: If ``self.path`` is neither a file nor a directory.
        """
        if not self.path.exists():
            raise FileNotFoundError(
                f"File/directory not found: {self.path}"
            )
        if self.path.is_dir():
            documents = []
            # iterate over the matching files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    documents.extend(self._load_document(item))
        elif self.path.is_file():
            documents = self._load_document(self.path)
        else:
            raise ValueError(
                f"TXT Loader: Invalid path: {self.path}"
            )
        return self.split_documents(documents)

    def parse(self, source):
        raise NotImplementedError(
            "Parser method is not implemented for TXTLoader."
        )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .models import BotData
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from typing import Optional, Union
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import PurePath
|
|
4
|
+
from datamodel import BaseModel, Field
|
|
5
|
+
|
|
6
|
+
def uuid_to_str(obj) -> str:
    """Encode *obj* (typically a UUID) as its string representation."""
    text = str(obj)
    return text
|
|
8
|
+
|
|
9
|
+
class BotData(BaseModel):
    """Record describing a chatbot's data source.

    Backed by the ``chatbots_data`` table per ``Meta.name`` — presumably
    the datamodel convention; confirm against the ORM layer.
    """
    # Primary key; encoded via uuid_to_str so UUID objects serialize as strings.
    chatbot_id: str = Field(primary_key=True, required=True, encoder=uuid_to_str)
    # Human-readable bot name.
    name: str = Field(required=True)
    source_type: str = Field(required=True, default='content')
    category: str = Field(required=True, default='data')
    tags: Optional[list[str]] = Field(required=False, default_factory=list)
    document_type: str = Field(required=False, default='document')
    # Name of the loader class used to ingest this source (e.g. 'TXTLoader').
    loader: str = Field(required=True, default='TXTLoader')
    # File or directory the loader reads from.
    source_path: Union[str,PurePath] = Field(required=False)
    # File extensions the loader should consider.
    extensions: list[str] = Field(required=False)
    # Inline payload as an alternative to source_path.
    data: Optional[Union[list,dict]] = Field(required=False)
    # Extra keyword arguments forwarded to the loader.
    arguments: Optional[dict] = Field(default_factory=dict)
    version: int = Field(required=True, default=1)
    # Default is the callable datetime.now, evaluated at record creation.
    updated_at: datetime = Field(required=False, default=datetime.now)

    class Meta:
        # Backing table name for this model.
        name: str = 'chatbots_data'
|
parrot/loaders/video.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import Any, Union, List
|
|
3
|
+
from abc import abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import subprocess
|
|
6
|
+
from .basevideo import BaseVideoLoader
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VideoLoader(BaseVideoLoader):
    """
    Generating video transcripts from videos.

    Downloads videos with the ``yt-dlp`` command-line tool; per-video
    transcript extraction is delegated to subclasses via ``load_video``.
    """
    _extension = ['.youtube']
    encoding = 'utf-8'
    chunk_size = 768

    def __init__(
        self,
        urls: List[str],
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'video',
        language: str = "en",
        video_path: Union[str, Path] = None,
        **kwargs
    ):
        """
        Args:
            urls: Video URLs, or one-key dicts mapping a URL to a title
                string or an info dict (see ``load``).
            tokenizer: Optional tokenizer forwarded to BaseVideoLoader.
            text_splitter: Optional splitter forwarded to BaseVideoLoader.
            source_type: Tag stored in document metadata.
            language: Transcript language code.
            video_path: Directory where downloaded videos are stored.
        """
        super().__init__(
            urls,
            tokenizer,
            text_splitter,
            source_type,
            language=language,
            video_path=video_path,
            **kwargs
        )

    def download_video(self, url: str, path: str) -> Path:
        """
        Download a video from a URL using yt-dlp.

        Args:
            url (str): The URL of the video to download.
            path (str | Path): The directory where the video will be saved.

        Returns:
            Path: The downloaded file, or ``None`` when yt-dlp fails.
        """
        # Callers pass Path objects despite the `str` annotation; the
        # original code called path.joinpath directly, so normalize here.
        path = Path(path)
        # Ask yt-dlp for the output filename first so already-downloaded
        # videos can be skipped.
        command = [
            "yt-dlp",
            "--get-filename",
            url
        ]
        try:
            result = subprocess.run(command, check=True, stdout=subprocess.PIPE, text=True)
            filename = result.stdout.strip()  # Remove any trailing newline characters
            file_path = path.joinpath(filename)
            if file_path.exists():
                # Original diagnostic lost its placeholder ("(unknown)");
                # restored as the file path and routed through the logger.
                self.logger.info(f"Video already downloaded: {file_path}")
                return file_path
            self.logger.info(f"Downloading video: {url}")
            # after extracting the filename, download the video
            command = [
                "yt-dlp",
                url,
                "-o",
                str(file_path)
            ]
            subprocess.run(command, check=True)
            return file_path
        except subprocess.CalledProcessError as e:
            # Best effort: report the failure and let the caller handle
            # the missing file (explicit None instead of fall-through).
            self.logger.error(f"Error downloading video: {e}")
            return None

    def load(self) -> list:
        """
        Build documents for every configured URL.

        Each entry in ``self.urls`` is either a plain URL string, or a
        one-key dict mapping the URL to a title string or to a dict with
        a ``title`` and an optional ``transcript`` key.
        """
        documents = []
        for url in self.urls:
            transcript = None
            if isinstance(url, dict):
                # {url: title} or {url: {'title': ..., 'transcript': ...}}
                path = list(url.keys())[0]
                parts = url[path]
                if isinstance(parts, str):
                    video_title = parts
                elif isinstance(parts, dict):
                    video_title = parts['title']
                    transcript = parts.get('transcript', None)
                url = path
            else:
                video_title = url
            docs = self.load_video(url, video_title, transcript)
            documents.extend(docs)
        return self.split_documents(documents)

    @abstractmethod
    def load_video(self, url: str, video_title: str, transcript: str) -> list:
        """Produce documents for a single video; implemented by subclasses."""
        pass

    def parse(self, source):
        pass
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from pathlib import PurePath
|
|
4
|
+
from langchain.docstore.document import Document
|
|
5
|
+
from .basevideo import BaseVideoLoader
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class VideoLocalLoader(BaseVideoLoader):
    """
    Generating Video transcripts from local Videos.

    Transcribes local video files with Whisper and emits one Document for
    the plain transcript, one for its VTT rendering, and one per dialog
    chunk.
    """
    _extension = ['.mp4', '.webm']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'documentation',
        encoding: str = 'utf-8',
        origin: str = '',
        **kwargs
    ):
        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
        self.path = path

    def load_video(self, path: PurePath) -> list:
        """Transcribe one local video file into a list of Documents."""
        # Metadata shared by the transcript and VTT documents (chunk
        # documents get their own copies below).
        base_meta = {
            "source": f"{path}",
            "url": f"{path.name}",
            "index": path.stem,
            "filename": f"{path}",
            "question": '',
            "answer": '',
            'type': 'video_transcript',
            "source_type": self._source_type,
            "summary": '',
            "document_meta": {
                "language": self._language,
                "topic_tags": ""
            }
        }
        results = []
        vtt_file = path.with_suffix('.vtt')
        audio_file = path.with_suffix('.mp3')
        # Pull the audio track out of the video first.
        self.extract_audio(path, audio_file)
        # Run Whisper over the extracted audio.
        whisper_result = self.get_whisper_transcript(audio_file)
        transcript = whisper_result['text'] if whisper_result else ''
        if transcript:
            # Attach a summary, then emit the plain-transcript document.
            base_meta['summary'] = self.get_summary_from_text(transcript)
            results.append(
                Document(page_content=transcript, metadata=base_meta)
            )
        if whisper_result:
            # VTT rendering of the same transcript.
            vtt_text = self.transcript_to_vtt(whisper_result, vtt_file)
            results.append(
                Document(page_content=vtt_text, metadata=base_meta)
            )
            # One document per dialog chunk, each with its own timing info.
            for chunk in self.transcript_to_blocks(whisper_result):
                chunk_meta = {
                    **base_meta,
                    "index": f"{path.stem}:{chunk['id']}",
                    "document_meta": {
                        "start": f"{chunk['start_time']}",
                        "end": f"{chunk['end_time']}",
                        "id": f"{chunk['id']}",
                        "language": self._language,
                        "title": f"{path.stem}",
                        "topic_tags": ""
                    }
                }
                results.append(
                    Document(page_content=chunk['text'], metadata=chunk_meta)
                )
        return results

    def load(self) -> list:
        """Load one file, or every matching file in a directory."""
        collected = []
        if self.path.is_file():
            collected.extend(self.load_video(self.path))
        if self.path.is_dir():
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    # honour the configured skip list
                    if set(item.parts).isdisjoint(self.skip_directories):
                        collected.extend(self.load_video(item))
        return self.split_documents(collected)
|
parrot/loaders/vimeo.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from typing import Optional, Union
|
|
2
|
+
from transformers import pipeline
|
|
3
|
+
import torch
|
|
4
|
+
from langchain.docstore.document import Document
|
|
5
|
+
from .youtube import YoutubeLoader
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class VimeoLoader(YoutubeLoader):
    """
    Loader for Vimeo videos.

    Reuses the YouTube pipeline (download via yt-dlp, audio extraction,
    Whisper transcription); only the per-video metadata assembly differs.
    """
    def load_video(self, url: str, video_title: str, transcript: Optional[Union[str, None]] = None) -> list:
        """
        Build documents for a single Vimeo video.

        Args:
            url: Video URL.
            video_title: Title used in document metadata.
            transcript: Optional path to an existing transcript file; when
                given, download and Whisper transcription are skipped.

        Returns:
            list: Split Langchain Documents (empty on download failure).
        """
        metadata = {
            "source": url,
            "url": url,
            "index": url,
            "filename": video_title,
            "question": '',
            "answer": '',
            'type': 'video_transcript',
            "source_type": self._source_type,
            "summary": '',
            "document_meta": {
                "language": self._language,
                "title": video_title,
                "topic_tags": ""
            }
        }
        if self.topics:
            metadata['document_meta']['topic_tags'] = self.topics
        if transcript is None:
            documents = []
            docs = []
            # first: download video
            try:
                file_path = self.download_video(url, self._video_path)
            except Exception as exc:
                # Skip this video but leave a trace instead of swallowing
                # the error silently (the original bare return hid failures).
                self.logger.warning(f"Error downloading video {url}: {exc}")
                return []
            if not file_path:
                self.logger.warning(
                    f"Error downloading File for video: {self._video_path}"
                )
                return []
            transcript_path = file_path.with_suffix('.vtt')
            audio_path = file_path.with_suffix('.mp3')
            # second: extract audio
            self.extract_audio(file_path, audio_path)
            # get the Whisper parser
            transcript_whisper = self.get_whisper_transcript(audio_path)
            if transcript_whisper:
                transcript = transcript_whisper['text']
            else:
                transcript = ''
            # Summarize the transcript
            if transcript:
                summary = self.get_summary_from_text(transcript)
                # One document for the plain transcript:
                metadata['summary'] = summary
                doc = Document(
                    page_content=transcript,
                    metadata=metadata
                )
                documents.append(doc)
            if transcript_whisper:
                # VTT version:
                transcript = self.transcript_to_vtt(transcript_whisper, transcript_path)
                doc = Document(
                    page_content=transcript,
                    metadata=metadata
                )
                documents.append(doc)
                # Saving every dialog chunk as a separate document
                dialogs = self.transcript_to_blocks(transcript_whisper)
                for chunk in dialogs:
                    _meta = {
                        "index": f"{video_title}:{chunk['id']}",
                        "document_meta": {
                            "start": f"{chunk['start_time']}",
                            "end": f"{chunk['end_time']}",
                            "id": f"{chunk['id']}",
                            "language": self._language,
                            "title": video_title,
                            "topic_tags": ""
                        }
                    }
                    _info = {**metadata, **_meta}
                    doc = Document(
                        page_content=chunk['text'],
                        metadata=_info
                    )
                    docs.append(doc)
                documents.extend(docs)
            return self.split_documents(documents)
        else:
            # using the provided transcript file instead of transcribing
            with open(transcript, 'r') as f:
                transcript = f.read()
            summary = self.get_summary_from_text(transcript)
            metadata['summary'] = f"{summary!s}"
            doc = Document(
                page_content=transcript,
                metadata=metadata
            )
            return self.split_documents([doc])
|