ai_parrot-0.3.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ai-parrot might be problematic.

Files changed (109)
  1. ai_parrot-0.3.4.dist-info/LICENSE +21 -0
  2. ai_parrot-0.3.4.dist-info/METADATA +319 -0
  3. ai_parrot-0.3.4.dist-info/RECORD +109 -0
  4. ai_parrot-0.3.4.dist-info/WHEEL +6 -0
  5. ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
  6. parrot/__init__.py +21 -0
  7. parrot/chatbots/__init__.py +7 -0
  8. parrot/chatbots/abstract.py +728 -0
  9. parrot/chatbots/asktroc.py +16 -0
  10. parrot/chatbots/base.py +366 -0
  11. parrot/chatbots/basic.py +9 -0
  12. parrot/chatbots/bose.py +17 -0
  13. parrot/chatbots/cody.py +17 -0
  14. parrot/chatbots/copilot.py +83 -0
  15. parrot/chatbots/dataframe.py +103 -0
  16. parrot/chatbots/hragents.py +15 -0
  17. parrot/chatbots/odoo.py +17 -0
  18. parrot/chatbots/retrievals/__init__.py +578 -0
  19. parrot/chatbots/retrievals/constitutional.py +19 -0
  20. parrot/conf.py +110 -0
  21. parrot/crew/__init__.py +3 -0
  22. parrot/crew/tools/__init__.py +22 -0
  23. parrot/crew/tools/bing.py +13 -0
  24. parrot/crew/tools/config.py +43 -0
  25. parrot/crew/tools/duckgo.py +62 -0
  26. parrot/crew/tools/file.py +24 -0
  27. parrot/crew/tools/google.py +168 -0
  28. parrot/crew/tools/gtrends.py +16 -0
  29. parrot/crew/tools/md2pdf.py +25 -0
  30. parrot/crew/tools/rag.py +42 -0
  31. parrot/crew/tools/search.py +32 -0
  32. parrot/crew/tools/url.py +21 -0
  33. parrot/exceptions.cpython-39-x86_64-linux-gnu.so +0 -0
  34. parrot/handlers/__init__.py +4 -0
  35. parrot/handlers/bots.py +196 -0
  36. parrot/handlers/chat.py +162 -0
  37. parrot/interfaces/__init__.py +6 -0
  38. parrot/interfaces/database.py +29 -0
  39. parrot/llms/__init__.py +137 -0
  40. parrot/llms/abstract.py +47 -0
  41. parrot/llms/anthropic.py +42 -0
  42. parrot/llms/google.py +42 -0
  43. parrot/llms/groq.py +45 -0
  44. parrot/llms/hf.py +45 -0
  45. parrot/llms/openai.py +59 -0
  46. parrot/llms/pipes.py +114 -0
  47. parrot/llms/vertex.py +78 -0
  48. parrot/loaders/__init__.py +20 -0
  49. parrot/loaders/abstract.py +456 -0
  50. parrot/loaders/audio.py +106 -0
  51. parrot/loaders/basepdf.py +102 -0
  52. parrot/loaders/basevideo.py +280 -0
  53. parrot/loaders/csv.py +42 -0
  54. parrot/loaders/dir.py +37 -0
  55. parrot/loaders/excel.py +349 -0
  56. parrot/loaders/github.py +65 -0
  57. parrot/loaders/handlers/__init__.py +5 -0
  58. parrot/loaders/handlers/data.py +213 -0
  59. parrot/loaders/image.py +119 -0
  60. parrot/loaders/json.py +52 -0
  61. parrot/loaders/pdf.py +437 -0
  62. parrot/loaders/pdfchapters.py +142 -0
  63. parrot/loaders/pdffn.py +112 -0
  64. parrot/loaders/pdfimages.py +207 -0
  65. parrot/loaders/pdfmark.py +88 -0
  66. parrot/loaders/pdftables.py +145 -0
  67. parrot/loaders/ppt.py +30 -0
  68. parrot/loaders/qa.py +81 -0
  69. parrot/loaders/repo.py +103 -0
  70. parrot/loaders/rtd.py +65 -0
  71. parrot/loaders/txt.py +92 -0
  72. parrot/loaders/utils/__init__.py +1 -0
  73. parrot/loaders/utils/models.py +25 -0
  74. parrot/loaders/video.py +96 -0
  75. parrot/loaders/videolocal.py +120 -0
  76. parrot/loaders/vimeo.py +106 -0
  77. parrot/loaders/web.py +216 -0
  78. parrot/loaders/web_base.py +112 -0
  79. parrot/loaders/word.py +125 -0
  80. parrot/loaders/youtube.py +192 -0
  81. parrot/manager.py +166 -0
  82. parrot/models.py +372 -0
  83. parrot/py.typed +0 -0
  84. parrot/stores/__init__.py +48 -0
  85. parrot/stores/abstract.py +171 -0
  86. parrot/stores/milvus.py +632 -0
  87. parrot/stores/qdrant.py +153 -0
  88. parrot/tools/__init__.py +12 -0
  89. parrot/tools/abstract.py +53 -0
  90. parrot/tools/asknews.py +32 -0
  91. parrot/tools/bing.py +13 -0
  92. parrot/tools/duck.py +62 -0
  93. parrot/tools/google.py +170 -0
  94. parrot/tools/stack.py +26 -0
  95. parrot/tools/weather.py +70 -0
  96. parrot/tools/wikipedia.py +59 -0
  97. parrot/tools/zipcode.py +179 -0
  98. parrot/utils/__init__.py +2 -0
  99. parrot/utils/parsers/__init__.py +5 -0
  100. parrot/utils/parsers/toml.cpython-39-x86_64-linux-gnu.so +0 -0
  101. parrot/utils/toml.py +11 -0
  102. parrot/utils/types.cpython-39-x86_64-linux-gnu.so +0 -0
  103. parrot/utils/uv.py +11 -0
  104. parrot/version.py +10 -0
  105. resources/users/__init__.py +5 -0
  106. resources/users/handlers.py +13 -0
  107. resources/users/models.py +205 -0
  108. settings/__init__.py +0 -0
  109. settings/settings.py +51 -0

parrot/loaders/audio.py ADDED
@@ -0,0 +1,106 @@
from typing import Any
from collections.abc import Callable
from pathlib import PurePath
from langchain.docstore.document import Document
from .basevideo import BaseVideoLoader


class AudioLoader(BaseVideoLoader):
    """
    Generates transcripts from local audio files.
    """
    _extension = ['.mp3', '.webm']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'documentation',
        encoding: str = 'utf-8',
        origin: str = '',
        **kwargs
    ):
        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
        self.path = path

    def load_audio(self, path: PurePath) -> list:
        metadata = {
            "source": f"{path}",
            "url": f"{path.name}",
            "index": path.stem,
            "filename": f"{path}",
            "question": '',
            "answer": '',
            'type': 'audio_transcript',
            "source_type": self._source_type,
            "summary": '',
            "document_meta": {
                "language": self._language,
                "topic_tags": ""
            }
        }
        documents = []
        transcript_path = path.with_suffix('.vtt')
        # get the Whisper transcript
        transcript_whisper = self.get_whisper_transcript(path)
        if transcript_whisper:
            transcript = transcript_whisper['text']
        else:
            transcript = ''
        # Summarize the transcript
        summary = ''
        if transcript:
            summary = self.get_summary_from_text(transcript)
        # Create two Documents: one for the transcript, a second for the VTT version
        metadata['summary'] = summary
        doc = Document(
            page_content=transcript,
            metadata=metadata
        )
        documents.append(doc)
        if transcript_whisper:
            # VTT version:
            transcript = self.transcript_to_vtt(transcript_whisper, transcript_path)
            doc = Document(
                page_content=transcript,
                metadata=metadata
            )
            documents.append(doc)
            # Save every dialog chunk as a separate document
            dialogs = self.transcript_to_blocks(transcript_whisper)
            docs = []
            for chunk in dialogs:
                _meta = {
                    "index": f"{path.stem}:{chunk['id']}",
                    "document_meta": {
                        "start": f"{chunk['start_time']}",
                        "end": f"{chunk['end_time']}",
                        "id": f"{chunk['id']}",
                        "language": self._language,
                        "title": f"{path.stem}",
                        "topic_tags": ""
                    }
                }
                _info = {**metadata, **_meta}
                doc = Document(
                    page_content=chunk['text'],
                    metadata=_info
                )
                docs.append(doc)
            documents.extend(docs)
        return documents

    def load(self) -> list:
        documents = []
        if self.path.is_file():
            docs = self.load_audio(self.path)
            documents.extend(docs)
        if self.path.is_dir():
            # iterate over the matching files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    if set(item.parts).isdisjoint(self.skip_directories):
                        documents.extend(
                            self.load_audio(item)
                        )
        return self.split_documents(documents)
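
For orientation, a minimal usage sketch for AudioLoader follows. It is not taken from the package documentation: the import path is inferred from the file layout above, and it assumes the tokenizer/text_splitter defaults inherited from AbstractLoader are sufficient.

# Hedged usage sketch; paths and defaults below are illustrative assumptions.
from pathlib import Path
from parrot.loaders.audio import AudioLoader

loader = AudioLoader(
    path=Path("./recordings"),   # a single .mp3/.webm file or a directory of them
    source_type="documentation",
)
docs = loader.load()             # full transcript, VTT rendering, plus per-dialog chunks
for doc in docs[:3]:
    print(doc.metadata["index"], doc.metadata["type"])
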
parrot/loaders/basepdf.py ADDED
@@ -0,0 +1,102 @@
from collections.abc import Callable
from typing import Any
from abc import abstractmethod
from pathlib import Path, PurePath
from PIL import Image
from .abstract import AbstractLoader
from ..conf import STATIC_DIR


class BasePDF(AbstractLoader):
    """
    Abstract base loader for all PDF files.
    """
    _extension = ['.pdf']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        super().__init__(tokenizer, text_splitter, source_type=source_type, **kwargs)
        self.path = path
        if isinstance(path, str):
            self.path = Path(path).resolve()
        self.save_images: bool = bool(kwargs.get('save_images', False))
        self._imgdir = STATIC_DIR.joinpath('images')
        if self.save_images is True:
            if self._imgdir.exists() is False:
                self._imgdir.mkdir(parents=True, exist_ok=True)
        if language == 'en':
            language = 'eng'
        self._lang = language

    def save_image(self, img_stream: Image.Image, image_name: str, save_path: Path):
        # Create the image directory if it does not exist
        if save_path.exists() is False:
            save_path.mkdir(parents=True, exist_ok=True)
        img_path = save_path.joinpath(image_name)
        self.logger.notice(
            f"Saving Image Page on {img_path}"
        )
        if not img_path.exists():
            # Save the image
            img_stream.save(img_path, format="PNG", optimize=True)
        return img_path

    @abstractmethod
    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using Fitz.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        pass

    def load(self) -> list:
        """
        Load data from the PDF file(s) configured in `self.path`.

        Returns:
            list: A list of Langchain Documents.
        """
        if isinstance(self.path, list):
            # list of files:
            documents = []
            for p in self.path:
                if self._check_path(p):
                    documents.extend(self._load_pdf(p))
            return self.split_documents(documents)
        if not self.path.exists():
            raise FileNotFoundError(
                f"PDF file/directory not found: {self.path}"
            )
        if self.path.is_dir():
            documents = []
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    if set(item.parts).isdisjoint(self.skip_directories):
                        documents.extend(self._load_pdf(item))
        elif self.path.is_file():
            documents = self._load_pdf(self.path)
        else:
            raise ValueError(
                f"PDF Loader: Invalid path: {self.path}"
            )
        return self.split_documents(documents)

    def parse(self, source):
        raise NotImplementedError(
            "Parser method is not implemented for PDFLoader."
        )
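
Since BasePDF only defines the traversal logic and leaves _load_pdf abstract, a minimal hypothetical subclass is sketched below. The shipped implementations live in parrot/loaders/pdf.py and related modules; this sketch merely assumes PyMuPDF (imported as fitz, which the docstring alludes to) is installed.

# Hypothetical subclass for illustration only; not part of the package.
from pathlib import Path

import fitz  # PyMuPDF; assumed available because the docstring mentions Fitz
from langchain.docstore.document import Document
from parrot.loaders.basepdf import BasePDF


class PlainTextPDF(BasePDF):
    def _load_pdf(self, path: Path) -> list:
        # Extract plain text page by page and wrap each page in a Document.
        docs = []
        with fitz.open(str(path)) as pdf:
            for page in pdf:
                docs.append(
                    Document(
                        page_content=page.get_text("text"),
                        metadata={"source": str(path), "page": page.number},
                    )
                )
        return docs
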
parrot/loaders/basevideo.py ADDED
@@ -0,0 +1,280 @@
from collections.abc import Callable
from typing import Any, Union, List
from abc import abstractmethod
from pathlib import Path
from moviepy.editor import VideoFileClip
from transformers import (
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    GenerationConfig
)
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTimeStampLogitsProcessor
)

from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import (
    TokenTextSplitter,
)
from .abstract import AbstractLoader


def extract_video_id(url):
    parts = url.split("?v=")
    video_id = parts[1].split("&")[0]
    return video_id


class BaseVideoLoader(AbstractLoader):
    """
    Generates transcripts from videos.
    """
    _extension = ['.youtube']
    encoding = 'utf-8'
    chunk_size = 768

    def __init__(
        self,
        urls: List[str],
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'video',
        language: str = "en",
        video_path: Union[str, Path] = None,
        **kwargs
    ):
        super().__init__(tokenizer, text_splitter, source_type, **kwargs)
        self.urls = urls
        self._task = kwargs.get('task', "automatic-speech-recognition")
        # Topics:
        self.topics: list = kwargs.get('topics', [])
        self._model_size: str = kwargs.get('model_size', 'medium')
        self.summarization_model = "facebook/bart-large-cnn"
        self._model_name: str = kwargs.get('model_name', 'whisper')
        self.summarizer = pipeline(
            "summarization",
            tokenizer=AutoTokenizer.from_pretrained(
                self.summarization_model
            ),
            model=AutoModelForSeq2SeqLM.from_pretrained(
                self.summarization_model
            ),
            device=self._get_device()
        )
        # language:
        self._language = language
        # directory:
        if isinstance(video_path, str):
            self._video_path = Path(video_path).resolve()
        else:
            self._video_path = video_path

    def transcript_to_vtt(self, transcript: dict, transcript_path: Path) -> str:
        """
        Convert a transcript to VTT format.
        """
        vtt = "WEBVTT\n\n"
        for i, chunk in enumerate(transcript['chunks'], start=1):
            start, end = chunk['timestamp']
            text = chunk['text'].replace("\n", " ")  # Replace newlines in text with spaces

            if start is None or end is None:
                print(f"Warning: Missing timestamp for chunk {i}, skipping this chunk.")
                continue

            # Convert timestamps to WebVTT format (HH:MM:SS.MMM)
            start_vtt = f"{int(start // 3600):02}:{int(start % 3600 // 60):02}:{int(start % 60):02}.{int(start * 1000 % 1000):03}"
            end_vtt = f"{int(end // 3600):02}:{int(end % 3600 // 60):02}:{int(end % 60):02}.{int(end * 1000 % 1000):03}"

            vtt += f"{i}\n{start_vtt} --> {end_vtt}\n{text}\n\n"
        # Save the VTT file
        try:
            with open(str(transcript_path), "w") as f:
                f.write(vtt)
            print(f'Saved VTT file on {transcript_path}')
        except Exception as exc:
            print(f"Error saving VTT file: {exc}")
        return vtt

    def format_timestamp(self, seconds):
        # Format total seconds as hh:mm:ss,ms
        milliseconds = int((seconds % 1) * 1000)
        hours, remainder = divmod(int(seconds), 3600)
        minutes, secs = divmod(remainder, 60)
        return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

    def transcript_to_blocks(self, transcript: dict) -> list:
        """
        Convert a transcript to blocks.
        """
        blocks = []
        for i, chunk in enumerate(transcript['chunks'], start=1):
            current_window = {}
            start, end = chunk['timestamp']
            if start is None or end is None:
                print(f"Warning: Missing timestamp for chunk {i}, skipping this chunk.")
                continue

            start_srt = self.format_timestamp(start)
            end_srt = self.format_timestamp(end)
            text = chunk['text'].replace("\n", " ")  # Replace newlines in text with spaces
            current_window['id'] = i
            current_window['start_time'] = start_srt
            current_window['end_time'] = end_srt
            current_window['text'] = text
            blocks.append(current_window)
        return blocks

    def transcript_to_srt(self, transcript: dict) -> str:
        """
        Convert a transcript to SRT format.
        """
        srt = ""
        for i, chunk in enumerate(transcript['chunks'], start=1):
            start, end = chunk['timestamp']
            text = chunk['text'].replace("\n", " ")  # Replace newlines in text with spaces
            # Convert start and end times to SRT format HH:MM:SS,MS
            start_srt = f"{int(start // 3600):02}:{int(start % 3600 // 60):02}:{int(start % 60):02},{int(start * 1000 % 1000):03}"
            end_srt = f"{int(end // 3600):02}:{int(end % 3600 // 60):02}:{int(end % 60):02},{int(end * 1000 % 1000):03}"
            srt += f"{i}\n{start_srt} --> {end_srt}\n{text}\n\n"
        return srt

    def chunk_text(self, text, chunk_size, tokenizer):
        # Tokenize the text and get the number of tokens
        tokens = tokenizer.tokenize(text)
        # Split the tokens into chunks
        for i in range(0, len(tokens), chunk_size):
            yield tokenizer.convert_tokens_to_string(
                tokens[i:i + chunk_size]
            )

    def get_summary(self, documents: list) -> str:
        """
        Get a summary of a text.
        """
        try:
            splitter = TokenTextSplitter(
                chunk_size=5000,
                chunk_overlap=100,
            )
            summarize_chain = load_summarize_chain(
                llm=self._llm,
                chain_type="refine"
            )
            chunks = splitter.split_documents(documents)
            summary = summarize_chain.invoke(chunks)
            return summary
        except Exception as e:
            print('ERROR in get_summary:', e)
            return ""

    def summarization(self, text: str) -> str:
        """
        Get a summary of a text considering token limits.
        """
        try:
            tokenizer = self.summarizer.tokenizer
            # stay safely under the model limit
            max_length = tokenizer.model_max_length - 10
            summaries = []
            for text_chunk in self.chunk_text(text, max_length, tokenizer):
                chunk_summary = self.summarizer(
                    text_chunk,
                    max_length=150,
                    min_length=30,
                    do_sample=False)[0]['summary_text']
                summaries.append(chunk_summary)
            return " ".join(summaries)
        except Exception as e:
            print('ERROR in summarization:', e)
            return ""

    def extract_audio(self, video_path, audio_path):
        """
        Extracts the audio from a video file and saves it as an audio file.

        Args:
            video_path (str): Path to the video file.
            audio_path (str): Path where the extracted audio file will be saved.
        """
        if audio_path.exists():
            print(f"Audio already extracted: {audio_path}")
            return
        video_clip = VideoFileClip(str(video_path))
        audio_clip = video_clip.audio
        if not audio_clip:
            return
        audio_clip.write_audiofile(str(audio_path))
        audio_clip.close()
        video_clip.close()

    def get_whisper_transcript(self, audio_path: Path, chunk_length: int = 30):
        # Initialize the Whisper parser
        if self._model_name == 'whisper':
            if self._language == 'en':
                model_name = f"openai/whisper-{self._model_size}.en"
            elif self._language == 'es':
                model_name = f"juancopi81/whisper-{self._model_size}-es"
            else:
                model_name = "openai/whisper-large-v3"
        else:
            model_name = self._model_name

        # Load the model and processor
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        processor = WhisperProcessor.from_pretrained(model_name)

        # Try to load the generation config, fall back to the default if it doesn't exist
        try:
            generation_config = GenerationConfig.from_pretrained(model_name)
        except EnvironmentError:
            print(
                f"Warning: No generation_config.json found for model {model_name}. Using default configuration."
            )
            generation_config = model.generation_config

        # Check and set the no_timestamps_token_id if it doesn't exist
        if not hasattr(model.config, 'no_timestamps_token_id'):
            model.config.no_timestamps_token_id = processor.tokenizer.convert_tokens_to_ids('<|notimestamps|>')

        # Define the generation configuration with WhisperTimeStampLogitsProcessor
        try:
            model.config.logits_processor = [
                WhisperTimeStampLogitsProcessor(generation_config)
            ]
        except Exception:
            model.config.logits_processor = [
                WhisperTimeStampLogitsProcessor(model.config)
            ]

        whisper_pipe = pipeline(
            task=self._task,
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=self._get_device(),
            chunk_length_s=chunk_length,
            use_fast=True
        )
        if audio_path.exists() and audio_path.stat().st_size > 0:
            # Use the pipeline to extract the transcript
            return whisper_pipe(
                str(audio_path),
                return_timestamps=True
            )
        return None

    @abstractmethod
    def load(self) -> list:
        pass

    @abstractmethod
    def load_video(self, url: str, video_title: str, transcript: str) -> list:
        pass

    def parse(self, source):
        pass
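
The VTT/SRT/block helpers above all consume the dictionary returned by get_whisper_transcript, i.e. a Hugging Face ASR pipeline result with return_timestamps=True. A small illustrative example of that shape:

# Illustrative transcript structure (not real output), matching what the
# helpers above expect: a 'text' field plus 'chunks' with (start, end) tuples.
sample_transcript = {
    "text": "Hello world. This is a demo.",
    "chunks": [
        {"timestamp": (0.0, 2.4), "text": "Hello world."},
        {"timestamp": (2.4, 5.1), "text": " This is a demo."},
    ],
}
# transcript_to_blocks(sample_transcript) would then yield entries such as:
# {"id": 1, "start_time": "00:00:00,000", "end_time": "00:00:02,400", "text": "Hello world."}
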
parrot/loaders/csv.py ADDED
@@ -0,0 +1,42 @@
from pathlib import PurePath
from langchain_community.document_loaders.csv_loader import CSVLoader as CSVL
from .abstract import AbstractLoader


class CSVLoader(AbstractLoader):
    """
    Loader for CSV files.
    """
    _extension = ['.csv']
    csv_args: dict = {
        "delimiter": ",",
        "quotechar": '"',
        "escapechar": "\\",
        "skipinitialspace": False,
        "lineterminator": "\n",
        "quoting": 0,
        "skiprows": 0,
        "encoding": None
    }

    def load(self, path: PurePath) -> list:
        """
        Load data from a CSV file.

        Args:
            path (PurePath): The path to the CSV file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading CSV file: {path}")
            loader = CSVL(
                file_path=path,
                csv_args=self.csv_args,
                autodetect_encoding=True
            )
            documents = loader.load()
            return self.split_documents(documents)
        else:
            return []
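
A hedged usage sketch follows; the constructor arguments are assumptions, since CSVLoader relies on whatever defaults AbstractLoader provides for the tokenizer and text splitter.

# Hedged usage sketch; constructor defaults and the file path are assumed, not documented.
from pathlib import Path
from parrot.loaders.csv import CSVLoader

loader = CSVLoader(source_type="documentation")
docs = loader.load(Path("data/products.csv"))   # hypothetical CSV file
print(f"{len(docs)} documents loaded")
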
parrot/loaders/dir.py ADDED
@@ -0,0 +1,37 @@
from pathlib import PurePath
from langchain_community.document_loaders import (
    DirectoryLoader
)


def load_directory(
    path: PurePath,
    text_splitter,
    source_type,
    file_pattern: str = "**/*.txt"
):
    """
    Load all text documents from a directory.

    Args:
        path (str): The path to the directory.
        text_splitter (TextSplitter): A text splitter object.
        source_type (str): The type of source.

    Returns:
        list: A list of documents.
    """
    loader = DirectoryLoader(
        path=str(path),
        glob=file_pattern,
        recursive=True,
        show_progress=True,
        use_multithreading=True
    )
    documents = loader.load()
    for doc in documents:
        doc.metadata['source_type'] = source_type
    return text_splitter.split_documents(documents)
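
And a short usage sketch for load_directory(); the splitter settings and glob pattern are illustrative assumptions, not package defaults.

# Hedged usage sketch; chunk sizes and the glob pattern are illustrative.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from parrot.loaders.dir import load_directory

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = load_directory(
    path="./docs",
    text_splitter=splitter,
    source_type="documentation",
    file_pattern="**/*.md",
)
print(f"{len(chunks)} chunks produced")
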