ai-parrot 0.3.4__cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic.
- ai_parrot-0.3.4.dist-info/LICENSE +21 -0
- ai_parrot-0.3.4.dist-info/METADATA +319 -0
- ai_parrot-0.3.4.dist-info/RECORD +109 -0
- ai_parrot-0.3.4.dist-info/WHEEL +6 -0
- ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
- parrot/__init__.py +21 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +728 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +366 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +83 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/odoo.py +17 -0
- parrot/chatbots/retrievals/__init__.py +578 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +110 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-39-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +162 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +137 -0
- parrot/llms/abstract.py +47 -0
- parrot/llms/anthropic.py +42 -0
- parrot/llms/google.py +42 -0
- parrot/llms/groq.py +45 -0
- parrot/llms/hf.py +45 -0
- parrot/llms/openai.py +59 -0
- parrot/llms/pipes.py +114 -0
- parrot/llms/vertex.py +78 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/audio.py +106 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +437 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +120 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +166 -0
- parrot/models.py +372 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +48 -0
- parrot/stores/abstract.py +171 -0
- parrot/stores/milvus.py +632 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +12 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-39-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-39-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/ppt.py
ADDED
@@ -0,0 +1,30 @@
+from pathlib import PurePath
+from langchain_community.document_loaders import (
+    UnstructuredPowerPointLoader
+)
+from .abstract import AbstractLoader
+
+
+class PPTXLoader(AbstractLoader):
+    """
+    Loader for PPTX files.
+    """
+    _extension: list = ['.pptx']
+
+    def load(self, path: PurePath) -> list:
+        if self._check_path(path):
+            docs = []
+            self.logger.info(f"Loading PPTX file: {path}")
+            ppt_loader = UnstructuredPowerPointLoader(
+                file_path=str(path)
+            )
+            docs += ppt_loader.load()
+            for doc in docs:
+                doc.metadata['source_type'] = self._source_type
+            # Split the documents into chunks
+            return self.split_documents(docs)
+        else:
+            return []
+
+    def parse(self, source):
+        pass
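A minimal usage sketch for the loader above; it assumes AbstractLoader can be constructed with no arguments, which the diff does not show:

    from pathlib import Path
    from parrot.loaders.ppt import PPTXLoader

    loader = PPTXLoader()  # assumes no-arg construction is allowed
    # "deck.pptx" is a hypothetical file; load() returns chunked Documents
    chunks = loader.load(Path("deck.pptx"))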
parrot/loaders/qa.py
ADDED
@@ -0,0 +1,81 @@
+
+from pathlib import Path, PurePath
+from typing import Any
+from collections.abc import Callable
+import pandas as pd
+from langchain.docstore.document import Document
+from .abstract import AbstractLoader
+
+
+class QAFileLoader(AbstractLoader):
+    """
+    Question and Answers File based on Excel.
+    """
+    _extension = ['.xlsx']
+    chunk_size = 768
+
+    def __init__(
+        self,
+        path: PurePath,
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Callable[..., Any] = None,
+        source_type: str = 'QA',
+        columns: list = ['Question', 'Answer'],
+        **kwargs
+    ):
+        super().__init__(tokenizer, text_splitter, source_type, **kwargs)
+        self.path = path
+        self._columns = columns
+        if isinstance(path, str):
+            self.path = Path(path).resolve()
+        if self.path.is_dir():
+            raise ValueError(
+                "Currently only accepting single Files."
+            )
+
+    def _load_document(self, path: PurePath) -> list:
+        if path.exists():
+            print('Load QA Excel File: ', path)
+            df = pd.read_excel(path)
+            q = self._columns[0]
+            a = self._columns[1]
+            docs = []
+            for idx, row in df.iterrows():
+                # Question Document
+                doc = Document(
+                    page_content=f"**Question:** {row[q]}: **Answer:** {row[a]}",
+                    metadata={
+                        "url": '',
+                        "index": f"{path.name} #{idx}",
+                        "source": f"{path.name} Row.#{idx}",
+                        "filename": f"{path.name}",
+                        "question": row[q],
+                        "answer": row[a],
+                        "page_number": idx,
+                        "source_type": self._source_type,
+                        "type": "QA",
+                        "summary": f"Question: {row[q]}?: **{row[a]}**",
+                        "document_meta": {
+                            "question": row[q],
+                            "answer": row[a],
+                        }
+                    }
+                )
+                docs.append(doc)
+            return docs
+        return []
+
+    def load(self, **kwargs) -> list:
+        """
+        Load question/answer pairs from the Excel file.
+
+        Returns:
+            list: A list of Langchain Documents.
+        """
+        if self.path.is_file():
+            documents = self._load_document(path=self.path, **kwargs)
+            # after all documents are retrieved, processed and stored
+            return self.split_documents(documents)
+
+    def parse(self, source):
+        pass
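Usage sketch for QAFileLoader: each spreadsheet row becomes one Document carrying the question/answer pair both in page_content and in metadata. "faq.xlsx" is a hypothetical workbook with the default 'Question'/'Answer' headers:

    from parrot.loaders.qa import QAFileLoader

    loader = QAFileLoader(path="faq.xlsx")
    docs = loader.load()
    for doc in docs[:3]:
        print(doc.metadata["question"], "->", doc.metadata["answer"])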
parrot/loaders/repo.py
ADDED
@@ -0,0 +1,103 @@
+from pathlib import PurePath
+from langchain_core.document_loaders.blob_loaders import Blob
+from langchain_community.document_loaders.generic import GenericLoader
+from langchain_community.document_loaders.parsers import LanguageParser
+from langchain_community.document_loaders import (
+    DirectoryLoader,
+    TextLoader,
+    JSONLoader
+)
+from langchain_text_splitters import Language
+from langchain.text_splitter import (
+    RecursiveCharacterTextSplitter
+)
+from .abstract import AbstractLoader
+
+
+class RepositoryLoader(AbstractLoader):
+    """Repository (Code Directory) loader.
+    """
+    exclude_paths: list = [
+        ".venv/**",
+        ".venv/**/**/*",
+        ".git/**",
+        "node_modules/**",
+        "build/**",
+        "dist/**",
+        "templates/**",
+        "tmp/**"
+    ]
+
+    def load(self, path: PurePath, lang: str = 'python', excludes: list = []) -> list:
+        """
+        Load data from a repository and return it as a Langchain Document.
+        """
+        if isinstance(path, str):
+            path = PurePath(path)
+        if excludes:
+            self.exclude_paths += excludes
+        excludes_path = [
+            str(path.joinpath(p).resolve()) for p in self.exclude_paths
+        ]
+        if lang == 'python':
+            parser = LanguageParser(language=Language.PYTHON, parser_threshold=100)
+            splitter = RecursiveCharacterTextSplitter.from_language(
+                language=Language.PYTHON, chunk_size=1024, chunk_overlap=200
+            )
+            suffixes = [".py", ".pyx"]
+            glob = "**/[!.]*.py?"
+        elif lang == 'javascript':
+            parser = LanguageParser(language=Language.JS, parser_threshold=100)
+            splitter = RecursiveCharacterTextSplitter.from_language(
+                language=Language.JS, chunk_size=1024, chunk_overlap=200
+            )
+            suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
+        elif lang == 'typescript':
+            parser = LanguageParser(language=Language.TS, parser_threshold=100)
+            splitter = RecursiveCharacterTextSplitter.from_language(
+                language=Language.TS, chunk_size=1024, chunk_overlap=200
+            )
+            suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
+        elif lang == 'json':
+            loader = DirectoryLoader(
+                path,
+                glob="**/*.json",
+                show_progress=True,
+                exclude=excludes_path,
+                silent_errors=True,
+                recursive=True,
+                # loader_cls=TextLoader,
+                loader_cls=JSONLoader,
+                loader_kwargs={
+                    'jq_schema': '.',
+                    'text_content': False
+                }
+            )
+            docs = loader.load()
+            for doc in docs:
+                doc.metadata['url'] = ''
+                doc.metadata['source_type'] = self._source_type
+                doc.metadata['language'] = lang
+            return self.text_splitter.split_documents(docs)
+        else:
+            raise ValueError(
+                f"Language {lang} not supported for Repository"
+            )
+        loader = GenericLoader.from_filesystem(
+            path,
+            glob=glob,
+            suffixes=suffixes,
+            exclude=self.exclude_paths,
+            parser=parser,
+            show_progress=True
+        )
+        docs = loader.load()
+        for doc in docs:
+            doc.metadata['url'] = ''
+            doc.metadata['source_type'] = self._source_type
+            doc.metadata['language'] = lang
+        documents = splitter.split_documents(docs)
+        return documents
+
+    def parse(self, source):
+        raise NotImplementedError("Parser method is not implemented for RepositoryLoader.")
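Usage sketch for RepositoryLoader; only 'python', 'javascript', 'typescript', and 'json' are accepted, and anything else raises ValueError. As above, it assumes a no-arg constructor:

    from pathlib import PurePath
    from parrot.loaders.repo import RepositoryLoader

    loader = RepositoryLoader()  # assumes no-arg construction is allowed
    chunks = loader.load(
        PurePath("/home/user/myproject"),  # hypothetical checkout
        lang="python",
        excludes=["tests/**"]  # merged into the class-level exclude_paths
    )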
parrot/loaders/rtd.py
ADDED
@@ -0,0 +1,65 @@
+from collections.abc import Callable
+from pathlib import PurePath
+from typing import Any
+import re
+from langchain_community.document_loaders import ReadTheDocsLoader as RTLoader
+from .abstract import AbstractLoader
+
+
+class ReadTheDocsLoader(AbstractLoader):
+    """
+    Loading documents from ReadTheDocs.
+    """
+    _extension: list = []
+
+    def __init__(
+        self,
+        path: PurePath,
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Callable[..., Any] = None,
+        source_type: str = 'documentation',
+        encoding: str = 'utf-8',
+        origin: str = '',
+        **kwargs
+    ):
+        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
+        self.path = path
+        self.encoding = encoding
+        self.origin = origin
+        self._prefix = ''
+        match = re.search(r'://([^/]+)', origin)
+        if match:
+            self._prefix = match.group(1)
+
+    def load(self) -> list:
+        documents = []
+        self.logger.info(
+            f"Loading ReadTheDocs from: {self.path}"
+        )
+        rt_loader = RTLoader(path=self.path, encoding=self.encoding)
+        documents = rt_loader.load()
+        for doc in documents:
+            src = doc.metadata.get('source')
+            suffix = src.replace(f'{self.path}', '').replace(self._prefix, '')
+            if suffix.startswith('//'):
+                suffix = suffix[1:]
+            url = f"{self.origin}{suffix}"
+            metadata = {
+                "source": url,
+                "url": url,
+                "index": suffix,
+                "filename": src,
+                "question": '',
+                "answer": '',
+                'type': 'documentation',
+                "source_type": self._source_type,
+                "summary": '',
+                "document_meta": {
+                    **doc.metadata
+                }
+            }
+            doc.metadata = metadata
+        return documents
+
+    def parse(self, source):
+        pass
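Usage sketch for ReadTheDocsLoader: it wraps the langchain_community loader and rewrites each document's source/url against origin. "rtd_mirror/" is a hypothetical local mirror of the hosted docs (e.g. fetched with wget -r):

    from parrot.loaders.rtd import ReadTheDocsLoader

    loader = ReadTheDocsLoader(
        path="rtd_mirror/",
        origin="https://myproject.readthedocs.io"
    )
    docs = loader.load()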
parrot/loaders/txt.py
ADDED
@@ -0,0 +1,92 @@
+from typing import Any
+from collections.abc import Callable
+from pathlib import Path, PurePath
+from langchain_community.document_loaders import TextLoader
+from langchain.docstore.document import Document
+from .abstract import AbstractLoader
+
+
+class TXTLoader(AbstractLoader):
+    """
+    Loader for TXT files.
+    """
+    _extension = ['.txt']
+
+    def __init__(
+        self,
+        path: PurePath,
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Callable[..., Any] = None,
+        source_type: str = 'text',
+        **kwargs
+    ):
+        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
+        self.path = path
+        if isinstance(path, str):
+            self.path = Path(path).resolve()
+
+    def _load_document(self, path: Path) -> list:
+        """
+        Load a TXT file.
+
+        Args:
+            path (Path): The path to the TXT file.
+
+        Returns:
+            list: A list of Langchain Documents.
+        """
+        if self._check_path(path):
+            self.logger.info(f"Loading TXT file: {path}")
+            with open(path, 'r') as file:
+                text = file.read()
+            return [
+                Document(
+                    page_content=text,
+                    metadata={
+                        "url": '',
+                        "index": str(path.name),
+                        "source": str(path),
+                        "filename": str(path.name),
+                        "summary": '',
+                        "question": '',
+                        "answer": '',
+                        'type': 'text',
+                        "source_type": self._source_type,
+                        "document_meta": {}
+                    }
+                )
+            ]
+        return []
+
+    def load(self) -> list:
+        """
+        Load data from a TXT file or a directory of TXT files.
+
+        The path is taken from ``self.path``, which is set
+        in the constructor.
+
+        Returns:
+            list: A list of Langchain Documents.
+        """
+        if not self.path.exists():
+            raise FileNotFoundError(
+                f"File/directory not found: {self.path}"
+            )
+        if self.path.is_dir():
+            documents = []
+            # iterate over the files in the directory
+            for ext in self._extension:
+                for item in self.path.glob(f'*{ext}'):
+                    documents.extend(self._load_document(item))
+        elif self.path.is_file():
+            documents = self._load_document(self.path)
+        else:
+            raise ValueError(
+                f"TXT Loader: Invalid path: {self.path}"
+            )
+        return self.split_documents(documents)
+
+    def parse(self, source):
+        raise NotImplementedError(
+            "Parser method is not implemented for TXTLoader."
+        )
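Usage sketch for TXTLoader, which accepts either a single .txt file or a directory (globbing *.txt one level deep). "notes/" is a hypothetical directory:

    from parrot.loaders.txt import TXTLoader

    loader = TXTLoader(path="notes/")
    chunks = loader.load()  # raises FileNotFoundError if the path does not exist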
parrot/loaders/utils/__init__.py
ADDED
@@ -0,0 +1 @@
+from .models import BotData
parrot/loaders/utils/models.py
ADDED
@@ -0,0 +1,25 @@
+from typing import Optional, Union
+from datetime import datetime
+from pathlib import PurePath
+from datamodel import BaseModel, Field
+
+def uuid_to_str(obj) -> str:
+    return str(obj)
+
+class BotData(BaseModel):
+    chatbot_id: str = Field(primary_key=True, required=True, encoder=uuid_to_str)
+    name: str = Field(required=True)
+    source_type: str = Field(required=True, default='content')
+    category: str = Field(required=True, default='data')
+    tags: Optional[list[str]] = Field(required=False, default_factory=list)
+    document_type: str = Field(required=False, default='document')
+    loader: str = Field(required=True, default='TXTLoader')
+    source_path: Union[str, PurePath] = Field(required=False)
+    extensions: list[str] = Field(required=False)
+    data: Optional[Union[list, dict]] = Field(required=False)
+    arguments: Optional[dict] = Field(default_factory=dict)
+    version: int = Field(required=True, default=1)
+    updated_at: datetime = Field(required=False, default=datetime.now)
+
+    class Meta:
+        name: str = 'chatbots_data'
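A hedged construction sketch for BotData; how defaults and validation behave depends on the external datamodel package, and every value below is hypothetical:

    from parrot.loaders.utils import BotData

    record = BotData(
        chatbot_id="demo-bot-001",
        name="docs-bot",
        loader="TXTLoader",
        source_path="/data/docs"
    )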
parrot/loaders/video.py
ADDED
@@ -0,0 +1,96 @@
+from collections.abc import Callable
+from typing import Any, Union, List
+from abc import abstractmethod
+from pathlib import Path
+import subprocess
+from .basevideo import BaseVideoLoader
+
+
+class VideoLoader(BaseVideoLoader):
+    """
+    Generating Video transcripts from Videos.
+    """
+    _extension = ['.youtube']
+    encoding = 'utf-8'
+    chunk_size = 768
+
+    def __init__(
+        self,
+        urls: List[str],
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Callable[..., Any] = None,
+        source_type: str = 'video',
+        language: str = "en",
+        video_path: Union[str, Path] = None,
+        **kwargs
+    ):
+        super().__init__(
+            urls,
+            tokenizer,
+            text_splitter,
+            source_type,
+            language=language,
+            video_path=video_path,
+            **kwargs
+        )
+
+    def download_video(self, url: str, path: Path) -> Path:
+        """
+        Downloads a video from a URL using yt-dlp.
+
+        Args:
+            url (str): The URL of the video to download.
+            path (Path): The directory where the video will be saved.
+        """
+        command = [
+            "yt-dlp",
+            "--get-filename",
+            url
+        ]
+        try:
+            result = subprocess.run(command, check=True, stdout=subprocess.PIPE, text=True)
+            filename = result.stdout.strip()  # Remove any trailing newline characters
+            file_path = path.joinpath(filename)
+            if file_path.exists():
+                print(f"Video already downloaded: {filename}")
+                return file_path
+            print(f"Downloading video: {filename}")
+            # after the filename is extracted, download the video
+            command = [
+                "yt-dlp",
+                url,
+                "-o",
+                str(file_path)
+            ]
+            subprocess.run(command, check=True)
+            return file_path
+        except subprocess.CalledProcessError as e:
+            print(f"Error downloading video: {e}")
+
+
+    def load(self) -> list:
+        documents = []
+        for url in self.urls:
+            transcript = None
+            if isinstance(url, dict):
+                path = list(url.keys())[0]
+                parts = url[path]
+                if isinstance(parts, str):
+                    video_title = parts
+                elif isinstance(parts, dict):
+                    video_title = parts['title']
+                    transcript = parts.get('transcript', None)
+                url = path
+            else:
+                video_title = url
+            docs = self.load_video(url, video_title, transcript)
+            documents.extend(docs)
+        # return documents
+        return self.split_documents(documents)
+
+    @abstractmethod
+    def load_video(self, url: str, video_title: str, transcript: str) -> list:
+        pass
+
+    def parse(self, source):
+        pass
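VideoLoader.load_video is abstract, so the class is meant to be subclassed (the youtube.py and vimeo.py loaders in this release are the likely concrete implementations). A sketch of the subclassing contract, with a hypothetical subclass and URL:

    from parrot.loaders.video import VideoLoader

    class MyVideoLoader(VideoLoader):
        def load_video(self, url: str, video_title: str, transcript: str) -> list:
            # fetch/transcribe the video and return Langchain Documents
            return []

    loader = MyVideoLoader(urls=["https://example.com/v/123"])
    docs = loader.load()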
parrot/loaders/videolocal.py
ADDED
@@ -0,0 +1,120 @@
+from typing import Any
+from collections.abc import Callable
+from pathlib import PurePath
+from langchain.docstore.document import Document
+from .basevideo import BaseVideoLoader
+
+
+class VideoLocalLoader(BaseVideoLoader):
+    """
+    Generating Video transcripts from local Videos.
+    """
+    _extension = ['.mp4', '.webm']
+
+    def __init__(
+        self,
+        path: PurePath,
+        tokenizer: Callable[..., Any] = None,
+        text_splitter: Callable[..., Any] = None,
+        source_type: str = 'documentation',
+        encoding: str = 'utf-8',
+        origin: str = '',
+        **kwargs
+    ):
+        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
+        self.path = path
+
+    def load_video(self, path: PurePath) -> list:
+        metadata = {
+            "source": f"{path}",
+            "url": f"{path.name}",
+            "index": path.stem,
+            "filename": f"{path}",
+            "question": '',
+            "answer": '',
+            'type': 'video_transcript',
+            "source_type": self._source_type,
+            "summary": '',
+            "document_meta": {
+                "language": self._language,
+                "topic_tags": ""
+            }
+        }
+        documents = []
+        transcript_path = path.with_suffix('.vtt')
+        audio_path = path.with_suffix('.mp3')
+        # first: extract the audio track from the video file
+        self.extract_audio(path, audio_path)
+        # get the Whisper parser
+        transcript_whisper = self.get_whisper_transcript(audio_path)
+        if transcript_whisper:
+            transcript = transcript_whisper['text']
+        else:
+            transcript = ''
+        # Summarize the transcript
+        if transcript:
+            summary = self.get_summary_from_text(transcript)
+            # Create two Documents: one for the transcript, a second for the VTT:
+            metadata['summary'] = summary
+            doc = Document(
+                page_content=transcript,
+                metadata=metadata
+            )
+            documents.append(doc)
+        if transcript_whisper:
+            # VTT version:
+            transcript = self.transcript_to_vtt(transcript_whisper, transcript_path)
+            doc = Document(
+                page_content=transcript,
+                metadata=metadata
+            )
+            documents.append(doc)
+            # Saving every dialog chunk as a separate document
+            dialogs = self.transcript_to_blocks(transcript_whisper)
+            docs = []
+            for chunk in dialogs:
+                _meta = {
+                    "index": f"{path.stem}:{chunk['id']}",
+                    "document_meta": {
+                        "start": f"{chunk['start_time']}",
+                        "end": f"{chunk['end_time']}",
+                        "id": f"{chunk['id']}",
+                        "language": self._language,
+                        "title": f"{path.stem}",
+                        "topic_tags": ""
+                    }
+                }
+                _info = {**metadata, **_meta}
+                doc = Document(
+                    page_content=chunk['text'],
+                    metadata=_info
+                )
+                docs.append(doc)
+            documents.extend(docs)
+        return documents
+
+    def load(self) -> list:
+        documents = []
+        if self.path.is_file():
+            docs = self.load_video(self.path)
+            documents.extend(docs)
+        if self.path.is_dir():
+            # iterate over the files in the directory
+            for ext in self._extension:
+                for item in self.path.glob(f'*{ext}'):
+                    if set(item.parts).isdisjoint(self.skip_directories):
+                        documents.extend(self.load_video(item))
+        return self.split_documents(documents)
+
+    def extract(self) -> list:
+        documents = []
+        if self.path.is_file():
+            docs = self.load_video(self.path)
+            documents.extend(docs)
+        if self.path.is_dir():
+            # iterate over the files in the directory
+            for ext in self._extension:
+                for item in self.path.glob(f'*{ext}'):
+                    if set(item.parts).isdisjoint(self.skip_directories):
+                        documents.extend(self.load_video(item))
+        return documents
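Usage sketch for VideoLocalLoader, which can emit three kinds of Documents per video: the raw transcript, a VTT rendering, and one Document per timed dialog block. It assumes BaseVideoLoader's extract_audio/get_whisper_transcript helpers (and their ffmpeg/Whisper dependencies) are available; "recordings/" is hypothetical:

    from pathlib import Path
    from parrot.loaders.videolocal import VideoLocalLoader

    loader = VideoLocalLoader(path=Path("recordings/"))
    docs = loader.load()    # chunked via split_documents
    raw = loader.extract()  # the same Documents, unsplit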