ai-parrot 0.1.0__cp311-cp311-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ai-parrot might be problematic; see the advisory details on the registry page for more information.

Files changed (108)
  1. ai_parrot-0.1.0.dist-info/LICENSE +21 -0
  2. ai_parrot-0.1.0.dist-info/METADATA +299 -0
  3. ai_parrot-0.1.0.dist-info/RECORD +108 -0
  4. ai_parrot-0.1.0.dist-info/WHEEL +5 -0
  5. ai_parrot-0.1.0.dist-info/top_level.txt +3 -0
  6. parrot/__init__.py +18 -0
  7. parrot/chatbots/__init__.py +7 -0
  8. parrot/chatbots/abstract.py +965 -0
  9. parrot/chatbots/asktroc.py +16 -0
  10. parrot/chatbots/base.py +257 -0
  11. parrot/chatbots/basic.py +9 -0
  12. parrot/chatbots/bose.py +17 -0
  13. parrot/chatbots/cody.py +17 -0
  14. parrot/chatbots/copilot.py +100 -0
  15. parrot/chatbots/dataframe.py +103 -0
  16. parrot/chatbots/hragents.py +15 -0
  17. parrot/chatbots/oddie.py +17 -0
  18. parrot/chatbots/retrievals/__init__.py +515 -0
  19. parrot/chatbots/retrievals/constitutional.py +19 -0
  20. parrot/conf.py +108 -0
  21. parrot/crew/__init__.py +3 -0
  22. parrot/crew/tools/__init__.py +22 -0
  23. parrot/crew/tools/bing.py +13 -0
  24. parrot/crew/tools/config.py +43 -0
  25. parrot/crew/tools/duckgo.py +62 -0
  26. parrot/crew/tools/file.py +24 -0
  27. parrot/crew/tools/google.py +168 -0
  28. parrot/crew/tools/gtrends.py +16 -0
  29. parrot/crew/tools/md2pdf.py +25 -0
  30. parrot/crew/tools/rag.py +42 -0
  31. parrot/crew/tools/search.py +32 -0
  32. parrot/crew/tools/url.py +21 -0
  33. parrot/exceptions.cpython-311-x86_64-linux-gnu.so +0 -0
  34. parrot/handlers/__init__.py +4 -0
  35. parrot/handlers/bots.py +196 -0
  36. parrot/handlers/chat.py +169 -0
  37. parrot/interfaces/__init__.py +6 -0
  38. parrot/interfaces/database.py +29 -0
  39. parrot/llms/__init__.py +0 -0
  40. parrot/llms/abstract.py +41 -0
  41. parrot/llms/anthropic.py +36 -0
  42. parrot/llms/google.py +37 -0
  43. parrot/llms/groq.py +33 -0
  44. parrot/llms/hf.py +39 -0
  45. parrot/llms/openai.py +49 -0
  46. parrot/llms/pipes.py +103 -0
  47. parrot/llms/vertex.py +68 -0
  48. parrot/loaders/__init__.py +20 -0
  49. parrot/loaders/abstract.py +456 -0
  50. parrot/loaders/basepdf.py +102 -0
  51. parrot/loaders/basevideo.py +280 -0
  52. parrot/loaders/csv.py +42 -0
  53. parrot/loaders/dir.py +37 -0
  54. parrot/loaders/excel.py +349 -0
  55. parrot/loaders/github.py +65 -0
  56. parrot/loaders/handlers/__init__.py +5 -0
  57. parrot/loaders/handlers/data.py +213 -0
  58. parrot/loaders/image.py +119 -0
  59. parrot/loaders/json.py +52 -0
  60. parrot/loaders/pdf.py +187 -0
  61. parrot/loaders/pdfchapters.py +142 -0
  62. parrot/loaders/pdffn.py +112 -0
  63. parrot/loaders/pdfimages.py +207 -0
  64. parrot/loaders/pdfmark.py +88 -0
  65. parrot/loaders/pdftables.py +145 -0
  66. parrot/loaders/ppt.py +30 -0
  67. parrot/loaders/qa.py +81 -0
  68. parrot/loaders/repo.py +103 -0
  69. parrot/loaders/rtd.py +65 -0
  70. parrot/loaders/txt.py +92 -0
  71. parrot/loaders/utils/__init__.py +1 -0
  72. parrot/loaders/utils/models.py +25 -0
  73. parrot/loaders/video.py +96 -0
  74. parrot/loaders/videolocal.py +107 -0
  75. parrot/loaders/vimeo.py +106 -0
  76. parrot/loaders/web.py +216 -0
  77. parrot/loaders/web_base.py +112 -0
  78. parrot/loaders/word.py +125 -0
  79. parrot/loaders/youtube.py +192 -0
  80. parrot/manager.py +152 -0
  81. parrot/models.py +347 -0
  82. parrot/py.typed +0 -0
  83. parrot/stores/__init__.py +0 -0
  84. parrot/stores/abstract.py +170 -0
  85. parrot/stores/milvus.py +540 -0
  86. parrot/stores/qdrant.py +153 -0
  87. parrot/tools/__init__.py +16 -0
  88. parrot/tools/abstract.py +53 -0
  89. parrot/tools/asknews.py +32 -0
  90. parrot/tools/bing.py +13 -0
  91. parrot/tools/duck.py +62 -0
  92. parrot/tools/google.py +170 -0
  93. parrot/tools/stack.py +26 -0
  94. parrot/tools/weather.py +70 -0
  95. parrot/tools/wikipedia.py +59 -0
  96. parrot/tools/zipcode.py +179 -0
  97. parrot/utils/__init__.py +2 -0
  98. parrot/utils/parsers/__init__.py +5 -0
  99. parrot/utils/parsers/toml.cpython-311-x86_64-linux-gnu.so +0 -0
  100. parrot/utils/toml.py +11 -0
  101. parrot/utils/types.cpython-311-x86_64-linux-gnu.so +0 -0
  102. parrot/utils/uv.py +11 -0
  103. parrot/version.py +10 -0
  104. resources/users/__init__.py +5 -0
  105. resources/users/handlers.py +13 -0
  106. resources/users/models.py +205 -0
  107. settings/__init__.py +0 -0
  108. settings/settings.py +51 -0
parrot/loaders/rtd.py ADDED
@@ -0,0 +1,65 @@
1
+ from collections.abc import Callable
2
+ from pathlib import PurePath
3
+ from typing import Any
4
+ import re
5
+ from langchain_community.document_loaders import ReadTheDocsLoader as RTLoader
6
+ from .abstract import AbstractLoader
7
+
8
+
9
class ReadTheDocsLoader(AbstractLoader):
    """
    Loading documents from ReadTheDocs.

    Wraps langchain's ReadTheDocsLoader (RTLoader) over a local dump of a
    ReadTheDocs site and rewrites each document's metadata so that
    ``source``/``url`` point back at the public site (derived from
    ``origin``) instead of the local file path.
    """
    # No extension filtering here: RTLoader walks the dump tree itself.
    _extension: list = []

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'documentation',
        encoding: str = 'utf-8',
        origin: str = '',
        **kwargs
    ):
        """
        Args:
            path: Root directory of the downloaded ReadTheDocs site.
            tokenizer: Optional tokenizer callable, forwarded to the base loader.
            text_splitter: Optional splitter callable, forwarded to the base loader.
            source_type: Metadata tag applied to produced documents.
            encoding: Encoding of the HTML files on disk.
            origin: Public base URL of the site, e.g. 'https://docs.example.com'.
        """
        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
        self.path = path
        self.encoding = encoding
        self.origin = origin
        # Host part of `origin` (the text between '://' and the next '/');
        # it is stripped from local paths when rebuilding public URLs.
        self._prefix = ''
        match = re.search(r'://([^/]+)', origin)
        if match:
            self._prefix = match.group(1)

    def load(self) -> list:
        """
        Load every page from the dump and normalize its metadata.

        Returns:
            list: Langchain Documents whose ``source``/``url`` metadata
            point at the public site; the original loader metadata is
            preserved under ``document_meta``.
        """
        self.logger.info(
            f"Loading ReadTheDocs from: {self.path}"
        )
        rt_loader = RTLoader(path=self.path, encoding=self.encoding)
        documents = rt_loader.load()
        for doc in documents:
            src = doc.metadata.get('source')
            # Local file path -> site-relative suffix: drop the dump root,
            # then the directory named after the origin's hostname.
            suffix = src.replace(f'{self.path}', '').replace(self._prefix, '')
            if suffix.startswith('//'):
                # Collapse the doubled separator left by the replacements.
                suffix = suffix[1:]
            url = f"{self.origin}{suffix}"
            metadata = {
                "source": url,
                "url": url,
                "index": suffix,
                "filename": src,
                "question": '',
                "answer": '',
                'type': 'documentation',
                "source_type": self._source_type,
                "summary": '',
                "document_meta": {
                    **doc.metadata
                }
            }
            doc.metadata = metadata
        return documents

    def parse(self, source):
        # Parsing is delegated entirely to the underlying RTLoader.
        pass
parrot/loaders/txt.py ADDED
@@ -0,0 +1,92 @@
1
+ from typing import Any
2
+ from collections.abc import Callable
3
+ from pathlib import Path, PurePath
4
+ from langchain_community.document_loaders import TextLoader
5
+ from langchain.docstore.document import Document
6
+ from .abstract import AbstractLoader
7
+
8
+
9
class TXTLoader(AbstractLoader):
    """
    Loader for plain-text (.txt) files.

    Accepts either a single file or a directory; directories are scanned
    (non-recursively) for every extension listed in ``_extension``.
    """
    _extension = ['.txt']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'text',
        **kwargs
    ):
        """
        Args:
            path: File or directory to load; plain strings are resolved to Paths.
            tokenizer: Optional tokenizer callable, forwarded to the base loader.
            text_splitter: Optional splitter callable, forwarded to the base loader.
            source_type: Metadata tag applied to produced documents.
        """
        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
        self.path = path
        if isinstance(path, str):
            self.path = Path(path).resolve()

    def _load_document(self, path: Path) -> list:
        """
        Load a single TXT file into one Document.

        Args:
            path (Path): The path to the TXT file.

        Returns:
            list: A list with one Langchain Document, or an empty list if
            the path fails the base-class check.
        """
        if self._check_path(path):
            self.logger.info(f"Loading TXT file: {path}")
            # Explicit encoding: the platform default (e.g. cp1252 on
            # Windows) would break on non-ASCII content.
            with open(path, 'r', encoding='utf-8') as file:
                text = file.read()
            return [
                Document(
                    page_content=text,
                    metadata={
                        "url": '',
                        "index": str(path.name),
                        "source": str(path),
                        "filename": str(path.name),
                        "summary": '',
                        "question": '',
                        "answer": '',
                        'type': 'text',
                        "source_type": self._source_type,
                        "document_meta": {}
                    }
                )
            ]
        return []

    def load(self) -> list:
        """
        Load data from ``self.path`` (a TXT file or a directory of them).

        Returns:
            list: Langchain Documents, already passed through the
            configured text splitter.

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
            ValueError: If ``self.path`` is neither a file nor a directory.
        """
        if not self.path.exists():
            raise FileNotFoundError(
                f"File/directory not found: {self.path}"
            )
        if self.path.is_dir():
            documents = []
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    documents.extend(self._load_document(item))
        elif self.path.is_file():
            documents = self._load_document(self.path)
        else:
            raise ValueError(
                f"TXT Loader: Invalid path: {self.path}"
            )
        return self.split_documents(documents)

    def parse(self, source):
        raise NotImplementedError(
            "Parser method is not implemented for TXTLoader."
        )
@@ -0,0 +1 @@
1
+ from .models import BotData
@@ -0,0 +1,25 @@
1
+ from typing import Optional, Union
2
+ from datetime import datetime
3
+ from pathlib import PurePath
4
+ from datamodel import BaseModel, Field
5
+
6
def uuid_to_str(obj) -> str:
    """Field encoder: render a UUID (or any value) via its ``str`` form."""
    return f"{obj!s}"
8
+
9
class BotData(BaseModel):
    """Persistence model describing a chatbot's knowledge-source configuration.

    Stored in the 'chatbots_data' table (see Meta). Field options
    (primary_key/required/default/encoder) follow the `datamodel` package's
    conventions.
    """
    # Primary key; serialized as a string via the uuid_to_str encoder.
    chatbot_id: str = Field(primary_key=True, required=True, encoder=uuid_to_str)
    # Human-readable bot name.
    name: str = Field(required=True)
    # Kind of source feeding the bot (defaults to 'content').
    source_type: str = Field(required=True, default='content')
    category: str = Field(required=True, default='data')
    tags: Optional[list[str]] = Field(required=False, default_factory=list)
    document_type: str = Field(required=False, default='document')
    # Name of the loader class used to ingest this source (e.g. 'TXTLoader').
    loader: str = Field(required=True, default='TXTLoader')
    # Filesystem location of the source material, when applicable.
    source_path: Union[str,PurePath] = Field(required=False)
    # File extensions the loader should consider.
    extensions: list[str] = Field(required=False)
    # Inline data payload (alternative to a filesystem source).
    data: Optional[Union[list,dict]] = Field(required=False)
    # Extra keyword arguments forwarded to the loader.
    arguments: Optional[dict] = Field(default_factory=dict)
    version: int = Field(required=True, default=1)
    # Default is the callable datetime.now — presumably evaluated per
    # instance by the datamodel Field machinery; verify against datamodel docs.
    updated_at: datetime = Field(required=False, default=datetime.now)

    class Meta:
        # Backing table/collection name used by the datamodel ORM.
        name: str = 'chatbots_data'
@@ -0,0 +1,96 @@
1
+ from collections.abc import Callable
2
+ from typing import Any, Union, List
3
+ from abc import abstractmethod
4
+ from pathlib import Path
5
+ import subprocess
6
+ from .basevideo import BaseVideoLoader
7
+
8
+
9
class VideoLoader(BaseVideoLoader):
    """
    Generating Video transcripts from Videos.

    Downloads remote videos with the yt-dlp CLI, then delegates transcript
    creation to the subclass's `load_video` implementation.
    """
    _extension = ['.youtube']
    encoding = 'utf-8'
    chunk_size = 768

    def __init__(
        self,
        urls: List[str],
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'video',
        language: str = "en",
        video_path: Union[str, Path] = None,
        **kwargs
    ):
        """
        Args:
            urls: Video URLs (or {url: title/transcript-spec} dicts) to load.
            tokenizer: Optional tokenizer callable, forwarded to the base loader.
            text_splitter: Optional splitter callable, forwarded to the base loader.
            source_type: Metadata tag applied to produced documents.
            language: Transcript language code.
            video_path: Directory where downloaded videos are stored.
        """
        super().__init__(
            urls,
            tokenizer,
            text_splitter,
            source_type,
            language=language,
            video_path=video_path,
            **kwargs
        )

    def download_video(self, url: str, path: Union[str, Path]) -> Path:
        """
        Downloads a video from a URL using yt-dlp.

        Args:
            url (str): The URL of the video to download.
            path: The directory where the video will be saved.

        Returns:
            Path: Location of the (possibly pre-existing) file, or
            None when yt-dlp fails.
        """
        # Tolerate str paths: joinpath below requires a Path object.
        path = Path(path)
        # First pass: ask yt-dlp only for the target filename, so an
        # already-downloaded file can be reused without re-fetching.
        command = [
            "yt-dlp",
            "--get-filename",
            url
        ]
        try:
            result = subprocess.run(command, check=True, stdout=subprocess.PIPE, text=True)
            filename = result.stdout.strip()  # Remove any trailing newline characters
            file_path = path.joinpath(filename)
            if file_path.exists():
                print(f"Video already downloaded: {file_path}")
                return file_path
            print(f"Downloading video: {url}")
            # after extracted filename, download the video
            command = [
                "yt-dlp",
                url,
                "-o",
                str(file_path)
            ]
            subprocess.run(command, check=True)
            return file_path
        except subprocess.CalledProcessError as e:
            # Best-effort: report the failure and signal it with None.
            print(f"Error downloading video: {e}")
            return None

    def load(self) -> list:
        """
        Load transcripts for every configured URL.

        Each entry in ``self.urls`` may be a plain URL string, or a dict
        mapping the URL to either a title string or a dict with 'title'
        and optional 'transcript' keys.

        Returns:
            list: Split Langchain Documents from all videos.
        """
        documents = []
        for url in self.urls:
            transcript = None
            if isinstance(url, dict):
                # Dict form: {url: title} or {url: {'title': ..., 'transcript': ...}}
                path = list(url.keys())[0]
                parts = url[path]
                if isinstance(parts, str):
                    video_title = parts
                elif isinstance(parts, dict):
                    video_title = parts['title']
                    transcript = parts.get('transcript', None)
                url = path
            else:
                video_title = url
            docs = self.load_video(url, video_title, transcript)
            documents.extend(docs)
        return self.split_documents(documents)

    @abstractmethod
    def load_video(self, url: str, video_title: str, transcript: str) -> list:
        """Produce Documents for a single video; implemented by subclasses."""
        pass

    def parse(self, source):
        # No standalone parsing: everything happens in load()/load_video().
        pass
@@ -0,0 +1,107 @@
1
+ from typing import Any
2
+ from collections.abc import Callable
3
+ from pathlib import PurePath
4
+ from langchain.docstore.document import Document
5
+ from .basevideo import BaseVideoLoader
6
+
7
+
8
class VideoLocalLoader(BaseVideoLoader):
    """
    Generating Video transcripts from local Videos.

    For each local video file: extracts audio, runs Whisper to get a
    transcript, then emits the full transcript, a VTT rendering, and one
    Document per dialog chunk.
    """
    _extension = ['.mp4', '.webm']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'documentation',
        encoding: str = 'utf-8',
        origin: str = '',
        **kwargs
    ):
        # NOTE(review): `encoding` and `origin` are accepted but never stored
        # or used — confirm whether they should be kept.
        # NOTE(review): unlike VideoLoader, no `urls` argument is passed to
        # super(); verify BaseVideoLoader accepts tokenizer as its first
        # positional parameter in this form.
        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
        self.path = path

    def load_video(self, path: PurePath) -> list:
        """
        Build Documents (full transcript, VTT, per-chunk dialogs) for one
        local video file.

        Args:
            path (PurePath): The video file to process.

        Returns:
            list: Langchain Documents (may be empty if no transcript was produced).
        """
        # Base metadata shared by every Document produced for this video.
        # self._language is presumably set by BaseVideoLoader — TODO confirm.
        metadata = {
            "source": f"{path}",
            "url": f"{path.name}",
            "index": path.stem,
            "filename": f"{path}",
            "question": '',
            "answer": '',
            'type': 'video_transcript',
            "source_type": self._source_type,
            "summary": '',
            "document_meta": {
                "language": self._language,
                "topic_tags": ""
            }
        }
        documents = []
        # Sibling files derived from the video's name.
        transcript_path = path.with_suffix('.vtt')
        audio_path = path.with_suffix('.mp3')
        # second: extract audio from File
        self.extract_audio(path, audio_path)
        # get the Whisper parser
        transcript_whisper = self.get_whisper_transcript(audio_path)
        if transcript_whisper:
            transcript = transcript_whisper['text']
        else:
            transcript = ''
        # Summarize the transcript
        if transcript:
            summary = self.get_summary_from_text(transcript)
            # Create Two Documents, one is for transcript, second is VTT:
            metadata['summary'] = summary
            doc = Document(
                page_content=transcript,
                metadata=metadata
            )
            documents.append(doc)
        if transcript_whisper:
            # VTT version:
            transcript = self.transcript_to_vtt(transcript_whisper, transcript_path)
            doc = Document(
                page_content=transcript,
                metadata=metadata
            )
            documents.append(doc)
            # Saving every dialog chunk as a separate document
            dialogs = self.transcript_to_blocks(transcript_whisper)
            docs = []
            for chunk in dialogs:
                # Per-chunk metadata overrides: index and timing info.
                _meta = {
                    "index": f"{path.stem}:{chunk['id']}",
                    "document_meta": {
                        "start": f"{chunk['start_time']}",
                        "end": f"{chunk['end_time']}",
                        "id": f"{chunk['id']}",
                        "language": self._language,
                        "title": f"{path.stem}",
                        "topic_tags": ""
                    }
                }
                _info = {**metadata, **_meta}
                doc = Document(
                    page_content=chunk['text'],
                    metadata=_info
                )
                docs.append(doc)
            documents.extend(docs)
        return documents

    def load(self) -> list:
        """
        Process ``self.path`` (a single video file or a directory of them).

        Returns:
            list: Split Langchain Documents from all matched videos.
        """
        documents = []
        if self.path.is_file():
            docs = self.load_video(self.path)
            documents.extend(docs)
        if self.path.is_dir():
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    # Skip anything under a directory listed in
                    # self.skip_directories (presumably set by the base class).
                    if set(item.parts).isdisjoint(self.skip_directories):
                        documents.extend(self.load_video(item))
        return self.split_documents(documents)
@@ -0,0 +1,106 @@
1
+ from typing import Optional, Union
2
+ from transformers import pipeline
3
+ import torch
4
+ from langchain.docstore.document import Document
5
+ from .youtube import YoutubeLoader
6
+
7
+
8
class VimeoLoader(YoutubeLoader):
    """
    Loader for Vimeo videos.

    Reuses YoutubeLoader's machinery; overrides load_video to either
    download + Whisper-transcribe the video, or read a caller-supplied
    transcript file.
    """
    def load_video(self, url: str, video_title: str, transcript: Optional[Union[str, None]] = None) -> list:
        """
        Build Documents for one Vimeo video.

        Args:
            url (str): Public video URL.
            video_title (str): Title used in metadata.
            transcript: When given, a path to an existing transcript file
                (the download/Whisper pipeline is skipped) — presumably a
                filesystem path; verify against callers.

        Returns:
            list: Split Langchain Documents (empty on download failure).
        """
        # Base metadata shared by every Document produced for this video.
        metadata = {
            "source": url,
            "url": url,
            "index": url,
            "filename": video_title,
            "question": '',
            "answer": '',
            'type': 'video_transcript',
            "source_type": self._source_type,
            "summary": '',
            "document_meta": {
                "language": self._language,
                "title": video_title,
                "topic_tags": ""
            }
        }
        if self.topics:
            metadata['document_meta']['topic_tags'] = self.topics
        if transcript is None:
            documents = []
            docs = []
            # first: download video
            try:
                file_path = self.download_video(url, self._video_path)
            except Exception:
                # Best-effort: a failed download yields no documents.
                return []
            if not file_path:
                self.logger.warning(
                    f"Error downloading File for video: {self._video_path}"
                )
                return []
            # Sibling files derived from the downloaded video's name.
            transcript_path = file_path.with_suffix('.vtt')
            audio_path = file_path.with_suffix('.mp3')
            # second: extract audio
            self.extract_audio(file_path, audio_path)
            # get the Whisper parser
            transcript_whisper = self.get_whisper_transcript(audio_path)
            if transcript_whisper:
                transcript = transcript_whisper['text']
            else:
                transcript = ''
            # Summarize the transcript
            if transcript:
                summary = self.get_summary_from_text(transcript)
                # Create Two Documents, one is for transcript, second is VTT:
                metadata['summary'] = summary
                doc = Document(
                    page_content=transcript,
                    metadata=metadata
                )
                documents.append(doc)
            if transcript_whisper:
                # VTT version:
                transcript = self.transcript_to_vtt(transcript_whisper, transcript_path)
                doc = Document(
                    page_content=transcript,
                    metadata=metadata
                )
                documents.append(doc)
                # Saving every dialog chunk as a separate document
                dialogs = self.transcript_to_blocks(transcript_whisper)
                for chunk in dialogs:
                    # Per-chunk metadata overrides: index and timing info.
                    _meta = {
                        "index": f"{video_title}:{chunk['id']}",
                        "document_meta": {
                            "start": f"{chunk['start_time']}",
                            "end": f"{chunk['end_time']}",
                            "id": f"{chunk['id']}",
                            "language": self._language,
                            "title": video_title,
                            "topic_tags": ""
                        }
                    }
                    _info = {**metadata, **_meta}
                    doc = Document(
                        page_content=chunk['text'],
                        metadata=_info
                    )
                    docs.append(doc)
                documents.extend(docs)
            return self.split_documents(documents)
        else:
            # using the transcript file
            # NOTE(review): opened without an explicit encoding — relies on
            # the platform default; confirm utf-8 is intended.
            with open(transcript, 'r') as f:
                transcript = f.read()
            summary = self.get_summary_from_text(transcript)
            transcript_whisper = None
            metadata['summary'] = f"{summary!s}"
            # Create Two Documents, one is for transcript, second is VTT:
            doc = Document(
                page_content=transcript,
                metadata=metadata
            )
            return self.split_documents([doc])