ai-parrot 0.3.11__cp311-cp311-manylinux_2_28_x86_64.whl → 0.3.16__cp311-cp311-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ai-parrot might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ai-parrot
3
- Version: 0.3.11
3
+ Version: 0.3.16
4
4
  Summary: Live Chatbots based on Langchain chatbots and Agents Integrated into Navigator Framework or used into aiohttp applications.
5
5
  Home-page: https://github.com/phenobarbital/ai-parrot
6
6
  Author: Jesus Lara
@@ -88,6 +88,22 @@ Requires-Dist: streamlit==1.37.1; extra == "analytics"
88
88
  Provides-Extra: anthropic
89
89
  Requires-Dist: langchain-anthropic==0.1.11; extra == "anthropic"
90
90
  Requires-Dist: anthropic==0.25.2; extra == "anthropic"
91
+ Provides-Extra: basic_loaders
92
+ Requires-Dist: youtube-transcript-api==0.6.2; extra == "basic-loaders"
93
+ Requires-Dist: pymupdf==1.24.4; extra == "basic-loaders"
94
+ Requires-Dist: pymupdf4llm==0.0.1; extra == "basic-loaders"
95
+ Requires-Dist: pdf4llm==0.0.6; extra == "basic-loaders"
96
+ Requires-Dist: pytube==15.0.0; extra == "basic-loaders"
97
+ Requires-Dist: pydub==0.25.1; extra == "basic-loaders"
98
+ Requires-Dist: markdownify==0.12.1; extra == "basic-loaders"
99
+ Requires-Dist: yt-dlp==2024.4.9; extra == "basic-loaders"
100
+ Requires-Dist: moviepy==1.0.3; extra == "basic-loaders"
101
+ Requires-Dist: rapidocr-onnxruntime==1.3.15; extra == "basic-loaders"
102
+ Requires-Dist: pytesseract==0.3.10; extra == "basic-loaders"
103
+ Requires-Dist: python-docx==1.1.0; extra == "basic-loaders"
104
+ Requires-Dist: python-pptx==0.6.23; extra == "basic-loaders"
105
+ Requires-Dist: docx2txt==0.8; extra == "basic-loaders"
106
+ Requires-Dist: mammoth==1.7.1; extra == "basic-loaders"
91
107
  Provides-Extra: crew
92
108
  Requires-Dist: colbert-ai==0.2.19; extra == "crew"
93
109
  Requires-Dist: vanna==0.3.4; extra == "crew"
@@ -104,26 +120,11 @@ Requires-Dist: llama-index-llms-huggingface==0.2.7; extra == "hunggingfaces"
104
120
  Provides-Extra: loaders
105
121
  Requires-Dist: unstructured==0.14.3; extra == "loaders"
106
122
  Requires-Dist: unstructured-client==0.18.0; extra == "loaders"
107
- Requires-Dist: youtube-transcript-api==0.6.2; extra == "loaders"
108
- Requires-Dist: pymupdf==1.24.4; extra == "loaders"
109
- Requires-Dist: pymupdf4llm==0.0.1; extra == "loaders"
110
- Requires-Dist: pdf4llm==0.0.6; extra == "loaders"
111
123
  Requires-Dist: PyPDF2==3.0.1; extra == "loaders"
112
124
  Requires-Dist: pdfminer.six==20231228; extra == "loaders"
113
125
  Requires-Dist: pdfplumber==0.11.0; extra == "loaders"
114
126
  Requires-Dist: GitPython==3.1.42; extra == "loaders"
115
127
  Requires-Dist: opentelemetry-sdk==1.24.0; extra == "loaders"
116
- Requires-Dist: rapidocr-onnxruntime==1.3.15; extra == "loaders"
117
- Requires-Dist: pytesseract==0.3.10; extra == "loaders"
118
- Requires-Dist: python-docx==1.1.0; extra == "loaders"
119
- Requires-Dist: python-pptx==0.6.23; extra == "loaders"
120
- Requires-Dist: docx2txt==0.8; extra == "loaders"
121
- Requires-Dist: pytube==15.0.0; extra == "loaders"
122
- Requires-Dist: pydub==0.25.1; extra == "loaders"
123
- Requires-Dist: markdownify==0.12.1; extra == "loaders"
124
- Requires-Dist: yt-dlp==2024.4.9; extra == "loaders"
125
- Requires-Dist: moviepy==1.0.3; extra == "loaders"
126
- Requires-Dist: mammoth==1.7.1; extra == "loaders"
127
128
  Requires-Dist: paddlepaddle==2.6.1; extra == "loaders"
128
129
  Requires-Dist: paddlepaddle-gpu==2.6.1; extra == "loaders"
129
130
  Requires-Dist: paddleocr==2.8.1; extra == "loaders"
@@ -1,18 +1,18 @@
1
1
  parrot/__init__.py,sha256=eTkAkHeJ5BBDG2fxrXA4M37ODBJoS1DQYpeBAWL2xeI,387
2
- parrot/conf.py,sha256=-9bVGC7Rf-6wpIg6-ojvU4S_G1wBLUCVDt46KEGHEhM,4257
2
+ parrot/conf.py,sha256=andrPREuR_BHiXA_Q0Utyb5xSb1ct_uKnjMzEOa1ftE,4373
3
3
  parrot/exceptions.cpython-311-x86_64-linux-gnu.so,sha256=VNyBh3uLxGQgB0l1bkWjQDqYUN2ZAvRmV12AqQijV9Q,361184
4
4
  parrot/manager.py,sha256=NhzXoWxSgtoWHpmYP8cV2Ujq_SlvCbQYQBaohAeL2TM,5935
5
5
  parrot/models.py,sha256=RsVQCqhSXBKRPcu-BCga9Y1wyvENFXDCuq3_ObIKvAo,13452
6
6
  parrot/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- parrot/version.py,sha256=iXWwg_05dEqR8n13PRWz5QqlmSq6_og3HT3moP37RAg,374
7
+ parrot/version.py,sha256=gt_HAcyWEN5lqHTmPKg0Nn-BgX6aq3F6Ah64q2Dfc7s,374
8
8
  parrot/chatbots/__init__.py,sha256=ypskCnME0xUv6psBEGCEyXCrD0J0ULHSllpVmSxqb4A,200
9
9
  parrot/chatbots/abstract.py,sha256=CmDn3k4r9uKImOZRN4L9zxLbCdC-1MPUAorDlfZT-kA,26421
10
10
  parrot/chatbots/asktroc.py,sha256=gyWzyvpAnmXwXd-3DEKoIJtAxt6NnP5mUZdZbkFky8s,604
11
- parrot/chatbots/base.py,sha256=cRw7k5FRKOfLdXQJeQvACVE5ZgE1NUWf3IY7OsSsxuo,12912
11
+ parrot/chatbots/base.py,sha256=5QX5-VPAOM-I8o0ktBt3_JEDPRQ_-iK0fFLRMUlvs_s,13396
12
12
  parrot/chatbots/basic.py,sha256=DIMTPoGc90BRSlokeOdnjlEXAAfZlIFqxXWaMyAX9uk,232
13
13
  parrot/chatbots/bose.py,sha256=z8rm8G_tAwHjDUodXfrAKnhaMzufQyf-GrhxwHeHle4,757
14
14
  parrot/chatbots/cody.py,sha256=Z0LNiNtZjEe7bA3hwexclBZK5zEF9m2ODVmrzZjC3Bw,623
15
- parrot/chatbots/copilot.py,sha256=JTnc-fdszwZ2nLmpNu-tVe6Al8z9PNIYHxv8fd42YQU,2051
15
+ parrot/chatbots/copilot.py,sha256=Q_CwoPm1M0loa7N3DLSLK8eq4m99z1CeU5FI9iqF9XI,2767
16
16
  parrot/chatbots/dataframe.py,sha256=CfZiLKIwnaku52nl2PNjciqRlH8m2lM4buO6xI7P408,3914
17
17
  parrot/chatbots/hragents.py,sha256=PyNIBJ2OH5CtfVydccgpY50V6GI3cLKuVdOMaa7sQz0,574
18
18
  parrot/chatbots/oddie.py,sha256=RMbANmJZP1_vLVGKRNBKmA8otyAiWPkvpA0rJ0U3tZk,796
@@ -45,11 +45,11 @@ parrot/llms/hf.py,sha256=f2HhHCICaSHp0y3KRhqNcYXNO-amYTxDXJ_2_9L5Bk8,1594
45
45
  parrot/llms/openai.py,sha256=NgWv6IwJ1DborlYhTyureBBdgHfAPc_lGHQRGt80ca8,1759
46
46
  parrot/llms/pipes.py,sha256=Ns_wh-alkKocZKlbQyQLKOSBxqfRC_hCbz34vpOOyP8,3798
47
47
  parrot/llms/vertex.py,sha256=a0UsH9sa_GiMkg31E52cWE8pXFZjyMtIanr7eAA7iyE,2615
48
- parrot/loaders/__init__.py,sha256=LGEaj54DP3FA5-C2IDaA8u-MF4lj-Lbd_Mx5R19qHYY,665
49
- parrot/loaders/abstract.py,sha256=_tsGDb7TracwkL20J2VYd5hC9MR262c2mmS9VvYB4vM,15870
48
+ parrot/loaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
+ parrot/loaders/abstract.py,sha256=Mx6BtihuwvFkl-Ali_84949BfVXFB0JZjmSKnJ-gXSg,17272
50
50
  parrot/loaders/audio.py,sha256=P2tWYKxWLM5TMLMm-5qR35cD_pGQWmf8-UZUTiK4R0o,3698
51
51
  parrot/loaders/basepdf.py,sha256=Qh_hzR0JArQEVP31SgWt9utt7qWmbfwVoCzUDyBHcXw,3243
52
- parrot/loaders/basevideo.py,sha256=WcX-q0Rn_E1dYurbA1eH5NOcUBdOye2iWiFTCY_DVgo,10292
52
+ parrot/loaders/basevideo.py,sha256=xLAMfIhjGR10T3-Pdx8MLq5Bp6woCOuq5Jr6yUZ6LCU,11992
53
53
  parrot/loaders/csv.py,sha256=DLcFK3z9boMNH3y9Qca5BWDfYXgXjXsGkzxVN1_2wyo,1103
54
54
  parrot/loaders/dir.py,sha256=_CU9kWGCpHnZplUamXLs2yEizA1aCRBASn3F6MggitQ,866
55
55
  parrot/loaders/excel.py,sha256=9cTsMfxR_YOpBHz9Ru0LJsxBXDVBh52XM8hHV63QgYo,12445
@@ -67,13 +67,13 @@ parrot/loaders/qa.py,sha256=3K_2yBxUzj-ifDpAbUsIc-v66004fKPzGavUqrhc3Kc,2646
67
67
  parrot/loaders/repo.py,sha256=vBqBAnwU6p3_DCvI9DVhi1Bs8iCDYHwFGp0P9zvGRyw,3737
68
68
  parrot/loaders/rtd.py,sha256=O0h7LDntP_0IBT8LDQi09u-gYVUO5cuvmGsfZLZ4CoU,1990
69
69
  parrot/loaders/txt.py,sha256=-xXVSuvkC2LQ2XZ44Nqwk3V8nE4F6UgXylosMCNgeFo,2804
70
- parrot/loaders/video.py,sha256=pl5Ho69bp5vrWMqg5tLbsnHUus1LByTDoL6NPk57Ays,2929
71
- parrot/loaders/videolocal.py,sha256=NwFB6n9sQZxh01L6YKIISpG1tuRsg-ME_qXCDS7Vtkk,5143
70
+ parrot/loaders/video.py,sha256=9zKUFFROSIbWjWFOvxDrW4uOewrMzD7-xADmszOpP4k,2930
71
+ parrot/loaders/videolocal.py,sha256=cRYv3KvKKHltMY4QbnvEMCOLHlEY9ZmWeXTL23fy-gA,9669
72
72
  parrot/loaders/vimeo.py,sha256=Cs7FkL2Cr8yV44-Tv5wWkveKzqhOeAIP6kF93SCr_Lk,4118
73
73
  parrot/loaders/web.py,sha256=kTi-NtAsbQLKi3wD_2o15Z0HHnYzsEEEGjH0RdvyQqQ,8869
74
74
  parrot/loaders/web_base.py,sha256=ZwSFXtJR71cpFGN1WCLUC2W6JjEUV865tRKf8isbJ5M,4382
75
75
  parrot/loaders/word.py,sha256=jZdHSL5CtAEn1otBYLNSqKLtO3BNcTObDPgqhzk5-4M,4533
76
- parrot/loaders/youtube.py,sha256=fVnBBw4IfK6NWP7mO66TgxOzJEcGwE3-3S1WMUApJYg,7751
76
+ parrot/loaders/youtube.py,sha256=DzH9bD5ZrLaTG_6GMjHsy1cHoTBR712yUC8tJiAYbNM,9607
77
77
  parrot/loaders/handlers/__init__.py,sha256=ksEDtUOEJELmyCIi0KNv7tR2fCUyADBVkwCcyqN_sVE,70
78
78
  parrot/loaders/handlers/data.py,sha256=olZ2p-wyUMGoazah7tgHY7V9buGX1FOeJ-cv2vGEoH8,7386
79
79
  parrot/loaders/utils/__init__.py,sha256=SkDyK3MuPGhp0NM6kHvaxQDe97Gcl3n9t5A741OVh1c,28
@@ -87,6 +87,7 @@ parrot/tools/abstract.py,sha256=pVSZw8MDpbVcQ-CHaGwP6CpqXHIs8hH8Oy1AqUuMmrw,1706
87
87
  parrot/tools/asknews.py,sha256=hEpPJMyNBVfj2maHbqnumn3VkY45oFvrjkE3Rq8EdGA,1039
88
88
  parrot/tools/bing.py,sha256=BtmFD66OIuCaOue5U2_yIqtjWf24IhEgNOX1LAVvHtA,464
89
89
  parrot/tools/duck.py,sha256=UAAZzlF-Q0sZh0_IcS96dwSgCuBPdeepkwRrMM5cJPY,1920
90
+ parrot/tools/execute.py,sha256=fTMQAsXuUzVyIWmZxL22LrSj2eQ-Rh-ncyUZ9gY-d-A,1687
90
91
  parrot/tools/google.py,sha256=NjijcUWH6Crk5Uty_x3FstjDTGZV8JXfBFDQEtMHhac,6236
91
92
  parrot/tools/stack.py,sha256=M-VRWjIDa18bl5p88dSKtxMj4Kn21YB76to0u6yXA30,942
92
93
  parrot/tools/weather.py,sha256=4v9Ft5lkVzb9Pg7afNs7BK5T3WEcsZbHPlBrF9oXSo8,2541
@@ -103,8 +104,8 @@ resources/users/handlers.py,sha256=BGzqBvPY_OaIF_nONWX4b_B5OyyBrdGuSihIsdlFwjk,2
103
104
  resources/users/models.py,sha256=glk7Emv7QCi6i32xRFDrGc8UwK23_LPg0XUOJoHnwRU,6799
104
105
  settings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
106
  settings/settings.py,sha256=9ueEvyLNurUX-AaIeRPV8GKX1c4YjDLbksUAeqEq6Ck,1854
106
- ai_parrot-0.3.11.dist-info/LICENSE,sha256=vRKOoa7onTsLNvSzJtGtMaNhWWh8B3YAT733Tlu6M4o,1070
107
- ai_parrot-0.3.11.dist-info/METADATA,sha256=zOYFernNOneZdQH-EIXCsmT83ATrzxGA4x9WmOhciX8,9838
108
- ai_parrot-0.3.11.dist-info/WHEEL,sha256=UQ-0qXN3LQUffjrV43_e_ZXj2pgORBqTmXipnkj0E8I,113
109
- ai_parrot-0.3.11.dist-info/top_level.txt,sha256=qHoO4BhYDfeTkyKnciZSQtn5FSLN3Q-P5xCTkyvbuxg,26
110
- ai_parrot-0.3.11.dist-info/RECORD,,
107
+ ai_parrot-0.3.16.dist-info/LICENSE,sha256=vRKOoa7onTsLNvSzJtGtMaNhWWh8B3YAT733Tlu6M4o,1070
108
+ ai_parrot-0.3.16.dist-info/METADATA,sha256=wD3EGhVYnjrQtP_j4853HjspuqFjDQvJB_-dvnBTK_A,9958
109
+ ai_parrot-0.3.16.dist-info/WHEEL,sha256=UQ-0qXN3LQUffjrV43_e_ZXj2pgORBqTmXipnkj0E8I,113
110
+ ai_parrot-0.3.16.dist-info/top_level.txt,sha256=qHoO4BhYDfeTkyKnciZSQtn5FSLN3Q-P5xCTkyvbuxg,26
111
+ ai_parrot-0.3.16.dist-info/RECORD,,
parrot/chatbots/base.py CHANGED
@@ -89,6 +89,9 @@ Whether you need help with a specific question or just want to have a conversati
89
89
  - OpenWeatherMap: Get weather information about a location.
90
90
  - yahoo_finance_news: Retrieve the latest financial news from Yahoo Finance.
91
91
  - python_repl_ast: A Python shell. Use this to execute python commands. Input should be a valid python command. When using this tool, sometimes output is abbreviated - make sure it does not look abbreviated before using it in your answer.
92
+ - executable_python_repl_ast: A Python shell. Use this to execute python commands. Input should be a valid python command. When using this tool, whenever you generate a visual output (like charts with matplotlib), instead of using plt.show(), render the image as a base64-encoded HTML string. Do this by saving the plot to a buffer and encoding it in base64, then return the result as a JSON object formatted as follows: "image": "format": "png", "base64": "base64-encoded-string".
93
+
94
+
92
95
  - youtube_search: Search for videos on YouTube based on specific keywords.
93
96
 
94
97
 
@@ -4,17 +4,43 @@ from .base import BaseAgent
4
4
  from ..tools import (
5
5
  ZipcodeAPIToolkit,
6
6
  WikipediaTool,
7
- WikidataTool,
7
+ # WikidataTool,
8
8
  GoogleSearchTool,
9
9
  GoogleLocationFinder,
10
10
  BingSearchTool,
11
- AskNewsTool,
11
+ # AskNewsTool,
12
12
  DuckDuckGoSearchTool,
13
13
  YouTubeSearchTool,
14
14
  OpenWeatherMapTool,
15
15
  StackExchangeTool,
16
16
  )
17
+ from ..tools.execute import ExecutablePythonREPLTool
17
18
 
19
+ # ZipCode API Toolkit
20
+ zpt = ZipcodeAPIToolkit()
21
+ zpt_tools = zpt.get_tools()
22
+
23
+ wk1 = WikipediaTool()
24
+ # wk12 = WikidataTool()
25
+
26
+ g1 = GoogleSearchTool()
27
+ g2 = GoogleLocationFinder()
28
+
29
+ b = BingSearchTool()
30
+ d = DuckDuckGoSearchTool()
31
+ # ask = AskNewsTool()
32
+
33
+ yt = YouTubeSearchTool()
34
+ stackexchange = StackExchangeTool()
35
+ weather = OpenWeatherMapTool()
36
+
37
+ tooling = [
38
+ wk1,
39
+ g1, g2,
40
+ b, d, yt,
41
+ weather,
42
+ stackexchange
43
+ ] + zpt_tools
18
44
 
19
45
  class CopilotAgent(BaseAgent):
20
46
  """CopilotAgent Agent.
@@ -30,17 +56,24 @@ class CopilotAgent(BaseAgent):
30
56
  **kwargs
31
57
  ):
32
58
  super().__init__(name, llm, tools, prompt_template, **kwargs)
59
+ if not tools:
60
+ tools = tooling
33
61
  self.tools = [
34
62
  PythonAstREPLTool(
35
63
  name='python_repl_ast',
36
64
  globals={},
37
65
  locals={}
66
+ ),
67
+ ExecutablePythonREPLTool(
68
+ name='executable_python_repl_ast',
69
+ globals={},
70
+ locals={}
38
71
  )
39
72
  ] + list(tools)
40
73
  self.prompt = self.get_prompt(
41
74
  self.prompt_template
42
75
  )
43
- # print('PROMPT > ', self.prompt)
76
+ print('PROMPT > ', self.prompt)
44
77
 
45
78
  @classmethod
46
79
  def default_tools(cls) -> list:
parrot/conf.py CHANGED
@@ -12,6 +12,8 @@ logging.getLogger(name='h5py').setLevel(logging.INFO)
12
12
  logging.getLogger(name='tensorflow').setLevel(logging.INFO)
13
13
  logging.getLogger(name='selenium.webdriver').setLevel(logging.WARNING)
14
14
  logging.getLogger(name='selenium').setLevel(logging.INFO)
15
+ logging.getLogger(name='matplotlib').setLevel(logging.WARNING)
16
+ logging.getLogger(name='PIL').setLevel(logging.INFO)
15
17
 
16
18
 
17
19
  # Static directory
@@ -1,20 +0,0 @@
1
- from .dir import load_directory
2
- from .pdf import PDFLoader
3
- from .web import WebLoader
4
- from .youtube import YoutubeLoader
5
- from .vimeo import VimeoLoader
6
- from .word import MSWordLoader
7
- from .ppt import PPTXLoader
8
- from .repo import RepositoryLoader
9
- from .github import GithubLoader
10
- from .json import JSONLoader
11
- from .excel import ExcelLoader
12
- from .web_base import WebBaseLoader
13
- from .pdfmark import PDFMarkdownLoader
14
- from .pdfimages import PDFImageLoader
15
- from .pdftables import PDFTablesLoader
16
- from .pdfchapters import PDFChapterLoader
17
- from .txt import TXTLoader
18
- from .qa import QAFileLoader
19
- from .rtd import ReadTheDocsLoader
20
- from .videolocal import VideoLocalLoader
@@ -74,6 +74,7 @@ class AbstractLoader(ABC):
74
74
  self,
75
75
  tokenizer: Union[str, Callable] = None,
76
76
  text_splitter: Union[str, Callable] = None,
77
+ translation: Optional[str] = None,
77
78
  source_type: str = 'file',
78
79
  **kwargs
79
80
  ):
@@ -114,6 +115,15 @@ class AbstractLoader(ABC):
114
115
  )
115
116
  # JSON encoder:
116
117
  self._encoder = JSONContent()
118
+ # Traslation
119
+ self._translation = translation
120
+ self.translator = None
121
+ if self._translation:
122
+ mdl = kwargs.get(
123
+ 'translation_model',
124
+ f"Helsinki-NLP/opus-mt-en-{self._translation}"
125
+ )
126
+ self.translator = self.get_translator(mdl)
117
127
 
118
128
 
119
129
  def __enter__(self):
@@ -159,6 +169,27 @@ class AbstractLoader(ABC):
159
169
  use_memory_efficient_attention=True,
160
170
  ).to(self._device)
161
171
 
172
+ def get_translator(self, model_name: str = 'Helsinki-NLP/opus-mt-en-es'):
173
+ if not self._translation:
174
+ return None
175
+ trans_model = AutoModelForSeq2SeqLM.from_pretrained(
176
+ model_name,
177
+ device_map="auto",
178
+ torch_dtype=torch.bfloat16,
179
+ trust_remote_code=True
180
+ )
181
+ trans_tokenizer = AutoTokenizer.from_pretrained(model_name)
182
+ translator = pipeline(
183
+ "translation",
184
+ model=trans_model,
185
+ tokenizer=trans_tokenizer,
186
+ batch_size=True,
187
+ max_new_tokens=500,
188
+ min_new_tokens=300,
189
+ use_fast=True
190
+ )
191
+ return translator
192
+
162
193
  def get_summarization_model(self, model_name: str = 'facebook/bart-large-cnn'):
163
194
  if self._no_summarization is True:
164
195
  return None
@@ -216,7 +247,7 @@ class AbstractLoader(ABC):
216
247
  return ''
217
248
  try:
218
249
  splitter = TokenTextSplitter(
219
- chunk_size=5000,
250
+ chunk_size=6144,
220
251
  chunk_overlap=100,
221
252
  )
222
253
  prompt_template = """Write a summary of the following, please also identify the main theme:
@@ -454,3 +485,15 @@ class AbstractLoader(ABC):
454
485
  for url in urls:
455
486
  documents += cls.load(url, **kwargs)
456
487
  return documents
488
+
489
+ def saving_file(self, filename: PurePath, data: Any):
490
+ """Save data to a file.
491
+
492
+ Args:
493
+ filename (PurePath): The path to the file.
494
+ data (Any): The data to save.
495
+ """
496
+ with open(filename, 'wb') as f:
497
+ f.write(data)
498
+ f.flush()
499
+ print(f':: Saved File on {filename}')
@@ -1,8 +1,9 @@
1
1
  from collections.abc import Callable
2
- from typing import Any, Union, List
2
+ from typing import Any, Union, List, Optional
3
3
  from abc import abstractmethod
4
4
  from pathlib import Path
5
5
  from moviepy.editor import VideoFileClip
6
+ from pydub import AudioSegment
6
7
  from transformers import (
7
8
  pipeline,
8
9
  AutoModelForSeq2SeqLM,
@@ -193,25 +194,69 @@ class BaseVideoLoader(AbstractLoader):
193
194
  print('ERROR in summarization:', e)
194
195
  return ""
195
196
 
196
- def extract_audio(self, video_path, audio_path):
197
+ def extract_audio(
198
+ self,
199
+ video_path: Path,
200
+ audio_path: Path,
201
+ compress_speed: bool = False,
202
+ output_path: Optional[Path] = None,
203
+ speed_factor: float = 1.5
204
+ ):
197
205
  """
198
- Extracts the audio from a video file and saves it as an audio file.
206
+ Extracts the audio from a video file and optionally compresses the audio speed.
199
207
 
200
208
  Args:
201
209
  video_path (str): Path to the video file.
202
210
  audio_path (str): Path where the extracted audio file will be saved.
211
+ compress_speed (bool): Whether to compress the audio speed.
212
+ speed_factor (float): The factor by which to speed up the audio.
203
213
  """
214
+ # Ensure that the paths are valid Path objects
215
+ video_path = Path(video_path)
216
+ audio_path = Path(audio_path)
217
+
218
+ # Check if the audio file already exists
204
219
  if audio_path.exists():
205
220
  print(f"Audio already extracted: {audio_path}")
206
221
  return
222
+
223
+ # Load the video and extract the audio
207
224
  video_clip = VideoFileClip(str(video_path))
208
225
  audio_clip = video_clip.audio
209
226
  if not audio_clip:
227
+ print("No audio found in video.")
210
228
  return
229
+
230
+ # Write the extracted audio to the specified path
231
+ print(f"Extracting audio to: {audio_path}")
211
232
  audio_clip.write_audiofile(str(audio_path))
212
233
  audio_clip.close()
213
234
  video_clip.close()
214
235
 
236
+ # Optionally compress the audio speed
237
+ if compress_speed:
238
+ print(f"Compressing audio speed by factor: {speed_factor}")
239
+
240
+ # Load the audio file with pydub
241
+ audio = AudioSegment.from_file(audio_path)
242
+
243
+ # Adjust the playback speed by modifying the frame rate
244
+ sped_up_audio = audio._spawn(audio.raw_data, overrides={
245
+ "frame_rate": int(audio.frame_rate * speed_factor)
246
+ })
247
+
248
+ # Restore the original frame rate to maintain proper playback speed
249
+ sped_up_audio = sped_up_audio.set_frame_rate(audio.frame_rate)
250
+
251
+ # Overwrite the original file with the sped-up version
252
+ if not output_path:
253
+ output_path = audio_path
254
+ sped_up_audio.export(output_path, format="mp3")
255
+ print(f"Compressed audio saved to: {audio_path}")
256
+ else:
257
+ print(f"Audio extracted: {audio_path}")
258
+
259
+
215
260
  def get_whisper_transcript(self, audio_path: Path, chunk_length: int = 30):
216
261
  # Initialize the Whisper parser
217
262
  if self._model_name == 'whisper':
parrot/loaders/video.py CHANGED
@@ -12,7 +12,7 @@ class VideoLoader(BaseVideoLoader):
12
12
  """
13
13
  _extension = ['.youtube']
14
14
  encoding = 'utf-8'
15
- chunk_size = 768
15
+ chunk_size = 2048
16
16
 
17
17
  def __init__(
18
18
  self,
@@ -1,5 +1,6 @@
1
1
  from typing import Any
2
2
  from collections.abc import Callable
3
+ import re
3
4
  import math
4
5
  from pathlib import PurePath
5
6
  from langchain.docstore.document import Document
@@ -8,16 +9,35 @@ from .basevideo import BaseVideoLoader
8
9
 
9
10
  def split_text(text, max_length):
10
11
  """Split text into chunks of a maximum length, ensuring not to break words."""
12
+ # Split the transcript into paragraphs
13
+ paragraphs = text.split('\n\n')
11
14
  chunks = []
12
- while len(text) > max_length:
13
- # Find the last space before the max_length
14
- split_point = text.rfind(' ', 0, max_length)
15
- # If no space found, split at max_length
16
- if split_point == -1:
17
- split_point = max_length
18
- chunks.append(text[:split_point])
19
- text = text[split_point:].strip()
20
- chunks.append(text)
15
+ current_chunk = ""
16
+ for paragraph in paragraphs:
17
+ # If the paragraph is too large, split it into sentences
18
+ if len(paragraph) > max_length:
19
+ # Split paragraph into sentences
20
+ sentences = re.split(r'(?<=[.!?]) +', paragraph)
21
+ for sentence in sentences:
22
+ if len(current_chunk) + len(sentence) + 1 > max_length:
23
+ # Save the current chunk and start a new one
24
+ chunks.append(current_chunk.strip())
25
+ current_chunk = sentence
26
+ else:
27
+ # Add sentence to the current chunk
28
+ current_chunk += " " + sentence
29
+ else:
30
+ # If adding the paragraph exceeds max size, start a new chunk
31
+ if len(current_chunk) + len(paragraph) + 2 > max_length:
32
+ chunks.append(current_chunk.strip())
33
+ current_chunk = paragraph
34
+ else:
35
+ # Add paragraph to the current chunk
36
+ current_chunk += "\n\n" + paragraph
37
+ # Add any remaining text to chunks
38
+ if current_chunk.strip():
39
+ chunks.append(current_chunk.strip())
40
+
21
41
  return chunks
22
42
 
23
43
 
@@ -37,15 +57,23 @@ class VideoLocalLoader(BaseVideoLoader):
37
57
  origin: str = '',
38
58
  **kwargs
39
59
  ):
40
- super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
60
+ super().__init__(
61
+ tokenizer,
62
+ text_splitter,
63
+ source_type=source_type,
64
+ **kwargs
65
+ )
66
+ self.extract_frames: bool = kwargs.pop('extract_frames', False)
67
+ self.seconds_per_frame: int = kwargs.pop('seconds_per_frame', 1)
68
+ self.compress_speed: bool = kwargs.pop('compress_speed', False)
69
+ self.speed_factor: float = kwargs.pop('speed_factor', 1.5)
41
70
  self.path = path
42
71
 
43
72
  def load_video(self, path: PurePath) -> list:
44
73
  metadata = {
45
- "url": f"{path.name}",
74
+ "url": f"{path}",
46
75
  "source": f"{path}",
47
- "filename": f"{path}",
48
- # "index": path.stem,
76
+ "filename": f"{path.name}",
49
77
  "question": '',
50
78
  "answer": '',
51
79
  'type': 'video_transcript',
@@ -58,10 +86,17 @@ class VideoLocalLoader(BaseVideoLoader):
58
86
  }
59
87
  }
60
88
  documents = []
61
- transcript_path = path.with_suffix('.vtt')
89
+ transcript_path = path.with_suffix('.txt')
90
+ vtt_path = path.with_suffix('.vtt')
91
+ summary_path = path.with_suffix('.summary')
62
92
  audio_path = path.with_suffix('.mp3')
63
93
  # second: extract audio from File
64
- self.extract_audio(path, audio_path)
94
+ self.extract_audio(
95
+ path,
96
+ audio_path,
97
+ compress_speed=self.compress_speed,
98
+ speed_factor=self.speed_factor
99
+ )
65
100
  # get the Whisper parser
66
101
  transcript_whisper = self.get_whisper_transcript(audio_path)
67
102
  if transcript_whisper:
@@ -70,35 +105,71 @@ class VideoLocalLoader(BaseVideoLoader):
70
105
  transcript = ''
71
106
  # Summarize the transcript
72
107
  if transcript:
73
- # Split transcript into chunks
74
- transcript_chunks = split_text(transcript, 32767)
108
+ # first: extract summary, saving summary as a document:
75
109
  summary = self.get_summary_from_text(transcript)
76
- # Create Two Documents, one is for transcript, second is VTT:
77
- metadata['summary'] = summary
78
- for chunk in transcript_chunks:
110
+ self.saving_file(summary_path, summary.encode('utf-8'))
111
+ # second: saving transcript to a file:
112
+ self.saving_file(transcript_path, transcript.encode('utf-8'))
113
+ # Create Three Documents:
114
+ # one is for transcript
115
+ # split document only if size > 65.534
116
+ if len(transcript) > 65534:
117
+ # Split transcript into chunks
118
+ transcript_chunks = split_text(transcript, 32767)
119
+ for chunk in transcript_chunks:
120
+ doc = Document(
121
+ page_content=chunk,
122
+ metadata=metadata
123
+ )
124
+ documents.append(doc)
125
+ else:
79
126
  doc = Document(
80
- page_content=chunk,
127
+ page_content=transcript,
81
128
  metadata=metadata
82
129
  )
83
130
  documents.append(doc)
131
+ # second is Summary
132
+ if summary:
133
+ _meta = {
134
+ **metadata,
135
+ "type": 'video summary'
136
+ }
137
+ doc = Document(
138
+ page_content=summary,
139
+ metadata=_meta
140
+ )
141
+ # Third is VTT:
84
142
  if transcript_whisper:
85
143
  # VTT version:
86
- transcript = self.transcript_to_vtt(transcript_whisper, transcript_path)
87
- transcript_chunks = split_text(transcript, 65535)
88
- for chunk in transcript_chunks:
144
+ transcript = self.transcript_to_vtt(transcript_whisper, vtt_path)
145
+ _meta = {
146
+ **metadata,
147
+ "type": 'video subte vtt'
148
+ }
149
+ if len(transcript) > 65535:
150
+ transcript_chunks = split_text(transcript, 65535)
151
+ for chunk in transcript_chunks:
152
+ doc = Document(
153
+ page_content=chunk,
154
+ metadata=_meta
155
+ )
156
+ documents.append(doc)
157
+ else:
89
158
  doc = Document(
90
- page_content=chunk,
91
- metadata=metadata
159
+ page_content=transcript,
160
+ metadata=_meta
92
161
  )
93
162
  documents.append(doc)
94
163
  # Saving every dialog chunk as a separate document
95
164
  dialogs = self.transcript_to_blocks(transcript_whisper)
96
165
  docs = []
97
166
  for chunk in dialogs:
167
+ start_time = chunk['start_time']
98
168
  _meta = {
99
- # "index": f"{path.stem}:{chunk['id']}",
169
+ "source": f"{path.name}: min. {start_time}",
170
+ "type": "video dialog",
100
171
  "document_meta": {
101
- "start": f"{chunk['start_time']}",
172
+ "start": f"{start_time}",
102
173
  "end": f"{chunk['end_time']}",
103
174
  "id": f"{chunk['id']}",
104
175
  "language": self._language,
@@ -128,15 +199,61 @@ class VideoLocalLoader(BaseVideoLoader):
128
199
  documents.extend(self.load_video(item))
129
200
  return self.split_documents(documents)
130
201
 
202
+ def extract_video(self, path: PurePath) -> list:
203
+ metadata = {
204
+ "url": f"{path}",
205
+ "source": f"{path}",
206
+ "filename": f"{path.name}",
207
+ 'type': 'video_transcript',
208
+ "source_type": self._source_type,
209
+ "transcript": None,
210
+ "summary": None,
211
+ "vtt": None
212
+ }
213
+ transcript_path = path.with_suffix('.txt')
214
+ vtt_path = path.with_suffix('.vtt')
215
+ summary_path = path.with_suffix('.summary')
216
+ audio_path = path.with_suffix('.mp3')
217
+ # second: extract audio from File
218
+ self.extract_audio(
219
+ path,
220
+ audio_path,
221
+ compress_speed=self.compress_speed,
222
+ speed_factor=self.speed_factor
223
+ )
224
+ # get the Whisper parser
225
+ transcript_whisper = self.get_whisper_transcript(audio_path)
226
+ if transcript_whisper:
227
+ transcript = transcript_whisper['text']
228
+ else:
229
+ transcript = ''
230
+ # Summarize the transcript
231
+ if transcript:
232
+ # first: extract summary, saving summary as a document:
233
+ summary = self.get_summary_from_text(transcript)
234
+ self.saving_file(summary_path, summary.encode('utf-8'))
235
+ # second: saving transcript to a file:
236
+ self.saving_file(transcript_path, transcript.encode('utf-8'))
237
+ metadata['transcript'] = transcript_path
238
+ metadata["summary"] = summary
239
+ metadata['summary_file'] = summary_path
240
+ metadata["vtt"] = vtt_path
241
+ # Third is VTT:
242
+ if transcript_whisper:
243
+ # VTT version:
244
+ transcript = self.transcript_to_vtt(transcript_whisper, vtt_path)
245
+ return metadata
246
+
131
247
  def extract(self) -> list:
248
+ # Adding also Translation to other language.
132
249
  documents = []
133
250
  if self.path.is_file():
134
- docs = self.load_video(self.path)
135
- documents.extend(docs)
136
- if self.path.is_dir():
251
+ doc = self.extract_video(self.path)
252
+ documents.append(doc)
253
+ elif self.path.is_dir():
137
254
  # iterate over the files in the directory
138
255
  for ext in self._extension:
139
256
  for item in self.path.glob(f'*{ext}'):
140
257
  if set(item.parts).isdisjoint(self.skip_directories):
141
- documents.extend(self.load_video(item))
258
+ documents.append(self.extract_video(item))
142
259
  return documents
parrot/loaders/youtube.py CHANGED
@@ -1,7 +1,6 @@
1
1
  from typing import Optional, Union
2
2
  from pytube import YouTube
3
3
  from youtube_transcript_api import NoTranscriptFound
4
- import torch
5
4
  from langchain.docstore.document import Document
6
5
  from langchain_community.document_loaders.parsers.audio import (
7
6
  OpenAIWhisperParserLocal
@@ -190,3 +189,53 @@ class YoutubeLoader(VideoLoader):
190
189
  metadata=metadata
191
190
  )
192
191
  return [doc]
192
+
193
+ def extract_video(
194
+ self,
195
+ url: str
196
+ ) -> list:
197
+ # first: load video metadata:
198
+ video_info = self.get_video_info(url)
199
+ # first: download video
200
+ file_path = self.download_video(url, self._video_path)
201
+ audio_path = file_path.with_suffix('.mp3')
202
+ transcript_path = file_path.with_suffix('.txt')
203
+ vtt_path = file_path.with_suffix('.vtt')
204
+ summary_path = file_path.with_suffix('.summary')
205
+ # second: extract audio
206
+ self.extract_audio(file_path, audio_path)
207
+ transcript_whisper = self.get_whisper_transcript(audio_path)
208
+ transcript = transcript_whisper['text']
209
+ # Summarize the transcript
210
+ try:
211
+ summary = self.get_summary_from_text(transcript)
212
+ self.saving_file(summary_path, summary.encode('utf-8'))
213
+ except Exception:
214
+ summary = ''
215
+ # Create Meta of Video Document
216
+ metadata = {
217
+ "url": f"{url}",
218
+ "source": f"{url}",
219
+ "source_type": self._source_type,
220
+ 'type': 'video_transcript',
221
+ "summary": f"{summary!s}",
222
+ "video_info": video_info
223
+ }
224
+ # VTT version:
225
+ transcript = self.transcript_to_vtt(transcript_whisper, vtt_path)
226
+ # second: saving transcript to a file:
227
+ self.saving_file(transcript_path, transcript.encode('utf-8'))
228
+ metadata['transcript'] = transcript_path
229
+ metadata["summary"] = summary
230
+ metadata['summary_file'] = summary_path
231
+ metadata["vtt"] = vtt_path
232
+ metadata['audio'] = audio_path
233
+ return metadata
234
+
235
+ def extract(self) -> list:
236
+ # Adding also Translation to other language.
237
+ documents = []
238
+ for url in self.urls:
239
+ doc = self.extract_video(url)
240
+ documents.append(doc)
241
+ return documents
@@ -0,0 +1,56 @@
1
+ """
2
+ Executable Python REPL Tool.
3
+ """
4
+ import io
5
+ import base64
6
+ import json
7
+ import matplotlib.pyplot as plt
8
+ from langchain_experimental.tools.python.tool import PythonAstREPLTool
9
+
10
+
11
+ class ExecutablePythonREPLTool(PythonAstREPLTool):
12
+ """
13
+ Executable Python REPL Tool.
14
+ """
15
+ def execute_code(self, code: str) -> str:
16
+ """
17
+ Execute the provided Python code and return the output.
18
+
19
+ Args:
20
+ code (str): The Python code to execute.
21
+
22
+ Returns:
23
+ str: The output of the executed code.
24
+ """
25
+ try:
26
+ # Set up a namespace for execution
27
+ namespace = {}
28
+ exec(code, namespace)
29
+
30
+ # Check if a plot was created
31
+ if 'plt' in namespace:
32
+ buf = io.BytesIO()
33
+ plt.savefig(buf, format='png')
34
+ plt.close()
35
+ buf.seek(0)
36
+ # Encode the image in base64
37
+ # Encode the image in base64
38
+ img_str = base64.b64encode(buf.read()).decode('utf-8')
39
+
40
+ # Prepare the JSON output
41
+ result = {
42
+ "image": {
43
+ "format": "png",
44
+ "base64": img_str
45
+ }
46
+ }
47
+ # Return both the code and the JSON result
48
+ return f"**Code Executed**:\n```python\n{code}\n```\n\n**Result**:\n{json.dumps(result)}"
49
+ else:
50
+ return f"**Code Executed**:\n```python\n{code}\n```\n\n"
51
+
52
+ except Exception as e:
53
+ return f"Error executing code: {e}"
54
+
55
+ def __call__(self, code: str) -> str:
56
+ return self.execute_code(code)
parrot/version.py CHANGED
@@ -3,7 +3,7 @@
3
3
  __title__ = "ai-parrot"
4
4
  __description__ = "Live Chatbots based on Langchain chatbots and Agents \
5
5
  Integrated into Navigator Framework or used into aiohttp applications."
6
- __version__ = "0.3.11"
6
+ __version__ = "0.3.16"
7
7
  __author__ = "Jesus Lara"
8
8
  __author_email__ = "jesuslarag@gmail.com"
9
9
  __license__ = "MIT"