ai-parrot 0.3.4__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release: this version of ai-parrot might be problematic.

Files changed (109)
  1. ai_parrot-0.3.4.dist-info/LICENSE +21 -0
  2. ai_parrot-0.3.4.dist-info/METADATA +319 -0
  3. ai_parrot-0.3.4.dist-info/RECORD +109 -0
  4. ai_parrot-0.3.4.dist-info/WHEEL +6 -0
  5. ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
  6. parrot/__init__.py +21 -0
  7. parrot/chatbots/__init__.py +7 -0
  8. parrot/chatbots/abstract.py +728 -0
  9. parrot/chatbots/asktroc.py +16 -0
  10. parrot/chatbots/base.py +366 -0
  11. parrot/chatbots/basic.py +9 -0
  12. parrot/chatbots/bose.py +17 -0
  13. parrot/chatbots/cody.py +17 -0
  14. parrot/chatbots/copilot.py +83 -0
  15. parrot/chatbots/dataframe.py +103 -0
  16. parrot/chatbots/hragents.py +15 -0
  17. parrot/chatbots/odoo.py +17 -0
  18. parrot/chatbots/retrievals/__init__.py +578 -0
  19. parrot/chatbots/retrievals/constitutional.py +19 -0
  20. parrot/conf.py +110 -0
  21. parrot/crew/__init__.py +3 -0
  22. parrot/crew/tools/__init__.py +22 -0
  23. parrot/crew/tools/bing.py +13 -0
  24. parrot/crew/tools/config.py +43 -0
  25. parrot/crew/tools/duckgo.py +62 -0
  26. parrot/crew/tools/file.py +24 -0
  27. parrot/crew/tools/google.py +168 -0
  28. parrot/crew/tools/gtrends.py +16 -0
  29. parrot/crew/tools/md2pdf.py +25 -0
  30. parrot/crew/tools/rag.py +42 -0
  31. parrot/crew/tools/search.py +32 -0
  32. parrot/crew/tools/url.py +21 -0
  33. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  34. parrot/handlers/__init__.py +4 -0
  35. parrot/handlers/bots.py +196 -0
  36. parrot/handlers/chat.py +162 -0
  37. parrot/interfaces/__init__.py +6 -0
  38. parrot/interfaces/database.py +29 -0
  39. parrot/llms/__init__.py +137 -0
  40. parrot/llms/abstract.py +47 -0
  41. parrot/llms/anthropic.py +42 -0
  42. parrot/llms/google.py +42 -0
  43. parrot/llms/groq.py +45 -0
  44. parrot/llms/hf.py +45 -0
  45. parrot/llms/openai.py +59 -0
  46. parrot/llms/pipes.py +114 -0
  47. parrot/llms/vertex.py +78 -0
  48. parrot/loaders/__init__.py +20 -0
  49. parrot/loaders/abstract.py +456 -0
  50. parrot/loaders/audio.py +106 -0
  51. parrot/loaders/basepdf.py +102 -0
  52. parrot/loaders/basevideo.py +280 -0
  53. parrot/loaders/csv.py +42 -0
  54. parrot/loaders/dir.py +37 -0
  55. parrot/loaders/excel.py +349 -0
  56. parrot/loaders/github.py +65 -0
  57. parrot/loaders/handlers/__init__.py +5 -0
  58. parrot/loaders/handlers/data.py +213 -0
  59. parrot/loaders/image.py +119 -0
  60. parrot/loaders/json.py +52 -0
  61. parrot/loaders/pdf.py +437 -0
  62. parrot/loaders/pdfchapters.py +142 -0
  63. parrot/loaders/pdffn.py +112 -0
  64. parrot/loaders/pdfimages.py +207 -0
  65. parrot/loaders/pdfmark.py +88 -0
  66. parrot/loaders/pdftables.py +145 -0
  67. parrot/loaders/ppt.py +30 -0
  68. parrot/loaders/qa.py +81 -0
  69. parrot/loaders/repo.py +103 -0
  70. parrot/loaders/rtd.py +65 -0
  71. parrot/loaders/txt.py +92 -0
  72. parrot/loaders/utils/__init__.py +1 -0
  73. parrot/loaders/utils/models.py +25 -0
  74. parrot/loaders/video.py +96 -0
  75. parrot/loaders/videolocal.py +120 -0
  76. parrot/loaders/vimeo.py +106 -0
  77. parrot/loaders/web.py +216 -0
  78. parrot/loaders/web_base.py +112 -0
  79. parrot/loaders/word.py +125 -0
  80. parrot/loaders/youtube.py +192 -0
  81. parrot/manager.py +166 -0
  82. parrot/models.py +372 -0
  83. parrot/py.typed +0 -0
  84. parrot/stores/__init__.py +48 -0
  85. parrot/stores/abstract.py +171 -0
  86. parrot/stores/milvus.py +632 -0
  87. parrot/stores/qdrant.py +153 -0
  88. parrot/tools/__init__.py +12 -0
  89. parrot/tools/abstract.py +53 -0
  90. parrot/tools/asknews.py +32 -0
  91. parrot/tools/bing.py +13 -0
  92. parrot/tools/duck.py +62 -0
  93. parrot/tools/google.py +170 -0
  94. parrot/tools/stack.py +26 -0
  95. parrot/tools/weather.py +70 -0
  96. parrot/tools/wikipedia.py +59 -0
  97. parrot/tools/zipcode.py +179 -0
  98. parrot/utils/__init__.py +2 -0
  99. parrot/utils/parsers/__init__.py +5 -0
  100. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  101. parrot/utils/toml.py +11 -0
  102. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  103. parrot/utils/uv.py +11 -0
  104. parrot/version.py +10 -0
  105. resources/users/__init__.py +5 -0
  106. resources/users/handlers.py +13 -0
  107. resources/users/models.py +205 -0
  108. settings/__init__.py +0 -0
  109. settings/settings.py +51 -0
parrot/loaders/ppt.py ADDED
@@ -0,0 +1,30 @@
+ from pathlib import PurePath
+ from langchain_community.document_loaders import (
+     UnstructuredPowerPointLoader
+ )
+ from .abstract import AbstractLoader
+
+
+ class PPTXLoader(AbstractLoader):
+     """
+     Loader for PPTX files.
+     """
+     _extension: list = ['.pptx']
+
+     def load(self, path: PurePath) -> list:
+         if self._check_path(path):
+             docs = []
+             self.logger.info(f"Loading PPTX file: {path}")
+             ppt_loader = UnstructuredPowerPointLoader(
+                 file_path=str(path)
+             )
+             docs += ppt_loader.load()
+             for doc in docs:
+                 doc.metadata['source_type'] = self._source_type
+             # Split the documents into chunks
+             return self.split_documents(docs)
+         else:
+             return []
+
+     def parse(self, source):
+         pass
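As a quick orientation, a minimal usage sketch (hypothetical file name; it assumes PPTXLoader can be constructed with AbstractLoader's defaults, since it defines no __init__ of its own):

from pathlib import Path
from parrot.loaders.ppt import PPTXLoader

# Hypothetical usage: chunk a slide deck into Langchain Documents.
loader = PPTXLoader()
docs = loader.load(Path('deck.pptx'))   # returns [] if _check_path() rejects the path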
parrot/loaders/qa.py ADDED
@@ -0,0 +1,81 @@
+
+ from pathlib import Path, PurePath
+ from typing import Any
+ from collections.abc import Callable
+ import pandas as pd
+ from langchain.docstore.document import Document
+ from .abstract import AbstractLoader
+
+
+ class QAFileLoader(AbstractLoader):
+     """
+     Question-and-Answer file loader based on Excel.
+     """
+     _extension = ['.xlsx']
+     chunk_size = 768
+
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'QA',
+         columns: list = ['Question', 'Answer'],
+         **kwargs
+     ):
+         super().__init__(tokenizer, text_splitter, source_type, **kwargs)
+         self.path = path
+         self._columns = columns
+         if isinstance(path, str):
+             self.path = Path(path).resolve()
+         if self.path.is_dir():
+             raise ValueError(
+                 "Currently only accepting single files."
+             )
+
+     def _load_document(self, path: PurePath) -> list:
+         if path.exists():
+             self.logger.info(f"Load QA Excel File: {path}")
+             df = pd.read_excel(path)
+             q = self._columns[0]
+             a = self._columns[1]
+             docs = []
+             for idx, row in df.iterrows():
+                 # One Document per Question/Answer row
+                 doc = Document(
+                     page_content=f"**Question:** {row[q]}: **Answer:** {row[a]}",
+                     metadata={
+                         "url": '',
+                         "index": f"{path.name} #{idx}",
+                         "source": f"{path.name} Row.#{idx}",
+                         "filename": f"{path.name}",
+                         "question": row[q],
+                         "answer": row[a],
+                         "page_number": idx,
+                         "source_type": self._source_type,
+                         "type": "QA",
+                         "summary": f"Question: {row[q]}?: **{row[a]}**",
+                         "document_meta": {
+                             "question": row[q],
+                             "answer": row[a],
+                         }
+                     }
+                 )
+                 docs.append(doc)
+             return docs
+         return []
+
+     def load(self, **kwargs) -> list:
+         """
+         Load question/answer pairs from an Excel file.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if self.path.is_file():
+             documents = self._load_document(path=self.path, **kwargs)
+             # after all documents are retrieved, processed and stored
+             return self.split_documents(documents)
+         return []
+
+     def parse(self, source):
+         pass
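A usage sketch for the loader above (hypothetical workbook; assumes an .xlsx whose 'Question' and 'Answer' columns match the default columns argument):

from parrot.loaders.qa import QAFileLoader

loader = QAFileLoader('faq.xlsx')   # a directory path would raise ValueError
chunks = loader.load()
# Each chunk keeps the original pair in its metadata:
# chunks[0].metadata['question'], chunks[0].metadata['answer']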
parrot/loaders/repo.py ADDED
@@ -0,0 +1,103 @@
+ from pathlib import Path, PurePath
+ from langchain_core.document_loaders.blob_loaders import Blob
+ from langchain_community.document_loaders.generic import GenericLoader
+ from langchain_community.document_loaders.parsers import LanguageParser
+ from langchain_community.document_loaders import (
+     DirectoryLoader,
+     TextLoader,
+     JSONLoader
+ )
+ from langchain_text_splitters import Language
+ from langchain.text_splitter import (
+     RecursiveCharacterTextSplitter
+ )
+ from .abstract import AbstractLoader
+
+
+ class RepositoryLoader(AbstractLoader):
+     """Repository (code directory) loader.
+     """
+     exclude_paths: list = [
+         ".venv/**",
+         ".venv/**/**/*",
+         ".git/**",
+         "node_modules/**",
+         "build/**",
+         "dist/**",
+         "templates/**",
+         "tmp/**"
+     ]
+
+     def load(self, path: PurePath, lang: str = 'python', excludes: list = None) -> list:
+         """
+         Load data from a repository and return it as Langchain Documents.
+         """
+         # Normalize to a concrete Path so .resolve() is available below
+         path = Path(path)
+         if excludes:
+             self.exclude_paths += excludes
+         excludes_path = [
+             str(path.joinpath(p).resolve()) for p in self.exclude_paths
+         ]
+         if lang == 'python':
+             parser = LanguageParser(language=Language.PYTHON, parser_threshold=100)
+             splitter = RecursiveCharacterTextSplitter.from_language(
+                 language=Language.PYTHON, chunk_size=1024, chunk_overlap=200
+             )
+             suffixes = [".py", ".pyx"]
+             glob = "**/[!.]*.py*"  # '*' rather than '?', so bare .py files also match
+         elif lang == 'javascript':
+             parser = LanguageParser(language=Language.JS, parser_threshold=100)
+             splitter = RecursiveCharacterTextSplitter.from_language(
+                 language=Language.JS, chunk_size=1024, chunk_overlap=200
+             )
+             suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
+             glob = "**/[!.]*"  # rely on suffixes to filter
+         elif lang == 'typescript':
+             parser = LanguageParser(language=Language.TS, parser_threshold=100)
+             splitter = RecursiveCharacterTextSplitter.from_language(
+                 language=Language.TS, chunk_size=1024, chunk_overlap=200
+             )
+             suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
+             glob = "**/[!.]*"  # rely on suffixes to filter
+         elif lang == 'json':
+             loader = DirectoryLoader(
+                 path,
+                 glob="**/*.json",
+                 show_progress=True,
+                 exclude=excludes_path,
+                 silent_errors=True,
+                 recursive=True,
+                 # loader_cls=TextLoader,
+                 loader_cls=JSONLoader,
+                 loader_kwargs={
+                     'jq_schema': '.',
+                     'text_content': False
+                 }
+             )
+             docs = loader.load()
+             for doc in docs:
+                 doc.metadata['url'] = ''
+                 doc.metadata['source_type'] = self._source_type
+                 doc.metadata['language'] = lang
+             return self.text_splitter.split_documents(docs)
+         else:
+             raise ValueError(
+                 f"Language {lang} not supported for Repository"
+             )
+         loader = GenericLoader.from_filesystem(
+             path,
+             glob=glob,
+             suffixes=suffixes,
+             exclude=self.exclude_paths,
+             parser=parser,
+             show_progress=True
+         )
+         docs = loader.load()
+         for doc in docs:
+             doc.metadata['url'] = ''
+             doc.metadata['source_type'] = self._source_type
+             doc.metadata['language'] = lang
+         documents = splitter.split_documents(docs)
+         return documents
+
+     def parse(self, source):
+         raise NotImplementedError("Parser method is not implemented for RepositoryLoader.")
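A sketch of how the branches above are selected (hypothetical repository path; assumes AbstractLoader's constructor defaults suffice): 'python', 'javascript' and 'typescript' go through GenericLoader with a language-aware splitter, while 'json' uses DirectoryLoader.

from parrot.loaders.repo import RepositoryLoader

loader = RepositoryLoader()
# Python sources are parsed with LanguageParser and split with a
# Python-aware splitter (chunk_size=1024, chunk_overlap=200, per the code above).
py_docs = loader.load('/path/to/repo', lang='python', excludes=['docs/**'])
json_docs = loader.load('/path/to/repo', lang='json')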
parrot/loaders/rtd.py ADDED
@@ -0,0 +1,65 @@
+ from collections.abc import Callable
+ from pathlib import PurePath
+ from typing import Any
+ import re
+ from langchain_community.document_loaders import ReadTheDocsLoader as RTLoader
+ from .abstract import AbstractLoader
+
+
+ class ReadTheDocsLoader(AbstractLoader):
+     """
+     Loading documents from ReadTheDocs.
+     """
+     _extension: list = []
+
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'documentation',
+         encoding: str = 'utf-8',
+         origin: str = '',
+         **kwargs
+     ):
+         super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
+         self.path = path
+         self.encoding = encoding
+         self.origin = origin
+         self._prefix = ''
+         match = re.search(r'://([^/]+)', origin)
+         if match:
+             self._prefix = match.group(1)
+
+     def load(self) -> list:
+         documents = []
+         self.logger.info(
+             f"Loading ReadTheDocs from: {self.path}"
+         )
+         rt_loader = RTLoader(path=self.path, encoding=self.encoding)
+         documents = rt_loader.load()
+         for doc in documents:
+             src = doc.metadata.get('source')
+             suffix = src.replace(f'{self.path}', '').replace(self._prefix, '')
+             if suffix.startswith('//'):
+                 suffix = suffix[1:]
+             url = f"{self.origin}{suffix}"
+             metadata = {
+                 "source": url,
+                 "url": url,
+                 "index": suffix,
+                 "filename": src,
+                 "question": '',
+                 "answer": '',
+                 'type': 'documentation',
+                 "source_type": self._source_type,
+                 "summary": '',
+                 "document_meta": {
+                     **doc.metadata
+                 }
+             }
+             doc.metadata = metadata
+         return documents
+
+     def parse(self, source):
+         pass
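The constructor's regex pulls the host out of origin, and load() uses it to turn each scraped file path back into a public URL. A standalone sketch of that mapping (hypothetical site and paths):

import re

origin = 'https://docs.example.org'
path = 'rtdocs'                                        # local scrape directory
prefix = re.search(r'://([^/]+)', origin).group(1)     # 'docs.example.org'

src = 'rtdocs/docs.example.org/en/latest/index.html'
suffix = src.replace(path, '').replace(prefix, '')     # '//en/latest/index.html'
if suffix.startswith('//'):
    suffix = suffix[1:]                                # keep a single slash
print(f'{origin}{suffix}')   # https://docs.example.org/en/latest/index.html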
parrot/loaders/txt.py ADDED
@@ -0,0 +1,92 @@
+ from typing import Any
+ from collections.abc import Callable
+ from pathlib import Path, PurePath
+ from langchain_community.document_loaders import TextLoader
+ from langchain.docstore.document import Document
+ from .abstract import AbstractLoader
+
+
+ class TXTLoader(AbstractLoader):
+     """
+     Loader for TXT files.
+     """
+     _extension = ['.txt']
+
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'text',
+         **kwargs
+     ):
+         super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
+         self.path = path
+         if isinstance(path, str):
+             self.path = Path(path).resolve()
+
+     def _load_document(self, path: Path) -> list:
+         """
+         Load a TXT file.
+
+         Args:
+             path (Path): The path to the TXT file.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if self._check_path(path):
+             self.logger.info(f"Loading TXT file: {path}")
+             with open(path, 'r') as file:
+                 text = file.read()
+             return [
+                 Document(
+                     page_content=text,
+                     metadata={
+                         "url": '',
+                         "index": str(path.name),
+                         "source": str(path),
+                         "filename": str(path.name),
+                         "summary": '',
+                         "question": '',
+                         "answer": '',
+                         'type': 'text',
+                         "source_type": self._source_type,
+                         "document_meta": {}
+                     }
+                 )
+             ]
+         return []
+
+     def load(self) -> list:
+         """
+         Load data from a TXT file or a directory of TXT files.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if not self.path.exists():
+             raise FileNotFoundError(
+                 f"File/directory not found: {self.path}"
+             )
+         if self.path.is_dir():
+             documents = []
+             # iterate over the files in the directory
+             for ext in self._extension:
+                 for item in self.path.glob(f'*{ext}'):
+                     documents.extend(self._load_document(item))
+         elif self.path.is_file():
+             documents = self._load_document(self.path)
+         else:
+             raise ValueError(
+                 f"TXT Loader: Invalid path: {self.path}"
+             )
+         return self.split_documents(documents)
+
+     def parse(self, source):
+         raise NotImplementedError(
+             "Parser method is not implemented for TXTLoader."
+         )
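Unlike PPTXLoader, this loader takes its path at construction time and load() walks directories; a short sketch (hypothetical directory):

from parrot.loaders.txt import TXTLoader

loader = TXTLoader('/data/notes')   # every *.txt inside becomes one Document
chunks = loader.load()              # raises FileNotFoundError if the path is missing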
parrot/loaders/utils/__init__.py ADDED
@@ -0,0 +1 @@
+ from .models import BotData
parrot/loaders/utils/models.py ADDED
@@ -0,0 +1,25 @@
+ from typing import Optional, Union
+ from datetime import datetime
+ from pathlib import PurePath
+ from datamodel import BaseModel, Field
+
+ def uuid_to_str(obj) -> str:
+     return str(obj)
+
+ class BotData(BaseModel):
+     chatbot_id: str = Field(primary_key=True, required=True, encoder=uuid_to_str)
+     name: str = Field(required=True)
+     source_type: str = Field(required=True, default='content')
+     category: str = Field(required=True, default='data')
+     tags: Optional[list[str]] = Field(required=False, default_factory=list)
+     document_type: str = Field(required=False, default='document')
+     loader: str = Field(required=True, default='TXTLoader')
+     source_path: Union[str, PurePath] = Field(required=False)
+     extensions: list[str] = Field(required=False)
+     data: Optional[Union[list, dict]] = Field(required=False)
+     arguments: Optional[dict] = Field(default_factory=dict)
+     version: int = Field(required=True, default=1)
+     updated_at: datetime = Field(required=False, default=datetime.now)
+
+     class Meta:
+         name: str = 'chatbots_data'
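A construction sketch for this model (hypothetical values; datamodel is a third-party model library, and unset fields are assumed to fall back to their declared defaults):

from uuid import uuid4
from parrot.loaders.utils import BotData

bot = BotData(
    chatbot_id=str(uuid4()),   # the uuid_to_str encoder keeps it a string
    name='docs-bot',
    loader='TXTLoader',        # matches the declared default loader
)
# version defaults to 1; updated_at defaults to datetime.now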
parrot/loaders/video.py ADDED
@@ -0,0 +1,96 @@
+ from collections.abc import Callable
+ from typing import Any, Union, List
+ from abc import abstractmethod
+ from pathlib import Path
+ import subprocess
+ from .basevideo import BaseVideoLoader
+
+
+ class VideoLoader(BaseVideoLoader):
+     """
+     Generating Video transcripts from Videos.
+     """
+     _extension = ['.youtube']
+     encoding = 'utf-8'
+     chunk_size = 768
+
+     def __init__(
+         self,
+         urls: List[str],
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'video',
+         language: str = "en",
+         video_path: Union[str, Path] = None,
+         **kwargs
+     ):
+         super().__init__(
+             urls,
+             tokenizer,
+             text_splitter,
+             source_type,
+             language=language,
+             video_path=video_path,
+             **kwargs
+         )
+
+     def download_video(self, url: str, path: Path) -> Path:
+         """
+         Downloads a video from a URL using yt-dlp.
+
+         Args:
+             url (str): The URL of the video to download.
+             path (Path): The directory where the video will be saved.
+         """
+         command = [
+             "yt-dlp",
+             "--get-filename",
+             url
+         ]
+         try:
+             result = subprocess.run(command, check=True, stdout=subprocess.PIPE, text=True)
+             filename = result.stdout.strip()  # Remove any trailing newline characters
+             file_path = path.joinpath(filename)
+             if file_path.exists():
+                 self.logger.info(f"Video already downloaded: {filename}")
+                 return file_path
+             self.logger.info(f"Downloading video: {filename}")
+             # after extracting the filename, download the video
+             command = [
+                 "yt-dlp",
+                 url,
+                 "-o",
+                 str(file_path)
+             ]
+             subprocess.run(command, check=True)
+             return file_path
+         except subprocess.CalledProcessError as e:
+             self.logger.error(f"Error downloading video: {e}")
+             return None
+
+     def load(self) -> list:
+         documents = []
+         for url in self.urls:
+             transcript = None
+             if isinstance(url, dict):
+                 path = list(url.keys())[0]
+                 parts = url[path]
+                 if isinstance(parts, str):
+                     video_title = parts
+                 elif isinstance(parts, dict):
+                     video_title = parts['title']
+                     transcript = parts.get('transcript', None)
+                 url = path
+             else:
+                 video_title = url
+             docs = self.load_video(url, video_title, transcript)
+             documents.extend(docs)
+         return self.split_documents(documents)
+
+     @abstractmethod
+     def load_video(self, url: str, video_title: str, transcript: str) -> list:
+         pass
+
+     def parse(self, source):
+         pass
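The load() loop above accepts three shapes of entry in urls; a sketch of valid input (field names taken from the code, URLs hypothetical):

urls = [
    'https://example.com/v/123',                  # plain URL; the URL doubles as the title
    {'https://example.com/v/456': 'My title'},    # URL mapped to a title
    {'https://example.com/v/789': {               # URL mapped to a dict
        'title': 'My title',
        'transcript': 'optional pre-supplied transcript',
    }},
]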
parrot/loaders/videolocal.py ADDED
@@ -0,0 +1,120 @@
+ from typing import Any
+ from collections.abc import Callable
+ from pathlib import PurePath
+ from langchain.docstore.document import Document
+ from .basevideo import BaseVideoLoader
+
+
+ class VideoLocalLoader(BaseVideoLoader):
+     """
+     Generating Video transcripts from local Videos.
+     """
+     _extension = ['.mp4', '.webm']
+
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'documentation',
+         encoding: str = 'utf-8',
+         origin: str = '',
+         **kwargs
+     ):
+         super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
+         self.path = path
+
+     def load_video(self, path: PurePath) -> list:
+         metadata = {
+             "source": f"{path}",
+             "url": f"{path.name}",
+             "index": path.stem,
+             "filename": f"{path}",
+             "question": '',
+             "answer": '',
+             'type': 'video_transcript',
+             "source_type": self._source_type,
+             "summary": '',
+             "document_meta": {
+                 "language": self._language,
+                 "topic_tags": ""
+             }
+         }
+         documents = []
+         transcript_path = path.with_suffix('.vtt')
+         audio_path = path.with_suffix('.mp3')
+         # first: extract the audio track from the file
+         self.extract_audio(path, audio_path)
+         # then get the Whisper transcript
+         transcript_whisper = self.get_whisper_transcript(audio_path)
+         if transcript_whisper:
+             transcript = transcript_whisper['text']
+         else:
+             transcript = ''
+         # Summarize the transcript
+         if transcript:
+             summary = self.get_summary_from_text(transcript)
+             # Create two Documents: one for the transcript, one for the VTT
+             metadata['summary'] = summary
+             doc = Document(
+                 page_content=transcript,
+                 metadata=metadata
+             )
+             documents.append(doc)
+         if transcript_whisper:
+             # VTT version:
+             transcript = self.transcript_to_vtt(transcript_whisper, transcript_path)
+             doc = Document(
+                 page_content=transcript,
+                 metadata=metadata
+             )
+             documents.append(doc)
+             # Saving every dialog chunk as a separate document
+             dialogs = self.transcript_to_blocks(transcript_whisper)
+             docs = []
+             for chunk in dialogs:
+                 _meta = {
+                     "index": f"{path.stem}:{chunk['id']}",
+                     "document_meta": {
+                         "start": f"{chunk['start_time']}",
+                         "end": f"{chunk['end_time']}",
+                         "id": f"{chunk['id']}",
+                         "language": self._language,
+                         "title": f"{path.stem}",
+                         "topic_tags": ""
+                     }
+                 }
+                 _info = {**metadata, **_meta}
+                 doc = Document(
+                     page_content=chunk['text'],
+                     metadata=_info
+                 )
+                 docs.append(doc)
+             documents.extend(docs)
+         return documents
+
+     def load(self) -> list:
+         documents = []
+         if self.path.is_file():
+             docs = self.load_video(self.path)
+             documents.extend(docs)
+         if self.path.is_dir():
+             # iterate over the files in the directory
+             for ext in self._extension:
+                 for item in self.path.glob(f'*{ext}'):
+                     if set(item.parts).isdisjoint(self.skip_directories):
+                         documents.extend(self.load_video(item))
+         return self.split_documents(documents)
+
+     def extract(self) -> list:
+         documents = []
+         if self.path.is_file():
+             docs = self.load_video(self.path)
+             documents.extend(docs)
+         if self.path.is_dir():
+             # iterate over the files in the directory
+             for ext in self._extension:
+                 for item in self.path.glob(f'*{ext}'):
+                     if set(item.parts).isdisjoint(self.skip_directories):
+                         documents.extend(self.load_video(item))
+         return documents
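A usage sketch (hypothetical directory; assumes BaseVideoLoader supplies the extract_audio, get_whisper_transcript and skip_directories members used above, and that its constructor accepts this call):

from pathlib import Path
from parrot.loaders.videolocal import VideoLocalLoader

loader = VideoLocalLoader(Path('videos'))
docs = loader.extract()   # un-split Documents: full transcript, VTT, dialog chunks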