ai_parrot-0.3.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ai-parrot might be problematic.

Files changed (109)
  1. ai_parrot-0.3.4.dist-info/LICENSE +21 -0
  2. ai_parrot-0.3.4.dist-info/METADATA +319 -0
  3. ai_parrot-0.3.4.dist-info/RECORD +109 -0
  4. ai_parrot-0.3.4.dist-info/WHEEL +6 -0
  5. ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
  6. parrot/__init__.py +21 -0
  7. parrot/chatbots/__init__.py +7 -0
  8. parrot/chatbots/abstract.py +728 -0
  9. parrot/chatbots/asktroc.py +16 -0
  10. parrot/chatbots/base.py +366 -0
  11. parrot/chatbots/basic.py +9 -0
  12. parrot/chatbots/bose.py +17 -0
  13. parrot/chatbots/cody.py +17 -0
  14. parrot/chatbots/copilot.py +83 -0
  15. parrot/chatbots/dataframe.py +103 -0
  16. parrot/chatbots/hragents.py +15 -0
  17. parrot/chatbots/odoo.py +17 -0
  18. parrot/chatbots/retrievals/__init__.py +578 -0
  19. parrot/chatbots/retrievals/constitutional.py +19 -0
  20. parrot/conf.py +110 -0
  21. parrot/crew/__init__.py +3 -0
  22. parrot/crew/tools/__init__.py +22 -0
  23. parrot/crew/tools/bing.py +13 -0
  24. parrot/crew/tools/config.py +43 -0
  25. parrot/crew/tools/duckgo.py +62 -0
  26. parrot/crew/tools/file.py +24 -0
  27. parrot/crew/tools/google.py +168 -0
  28. parrot/crew/tools/gtrends.py +16 -0
  29. parrot/crew/tools/md2pdf.py +25 -0
  30. parrot/crew/tools/rag.py +42 -0
  31. parrot/crew/tools/search.py +32 -0
  32. parrot/crew/tools/url.py +21 -0
  33. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  34. parrot/handlers/__init__.py +4 -0
  35. parrot/handlers/bots.py +196 -0
  36. parrot/handlers/chat.py +162 -0
  37. parrot/interfaces/__init__.py +6 -0
  38. parrot/interfaces/database.py +29 -0
  39. parrot/llms/__init__.py +137 -0
  40. parrot/llms/abstract.py +47 -0
  41. parrot/llms/anthropic.py +42 -0
  42. parrot/llms/google.py +42 -0
  43. parrot/llms/groq.py +45 -0
  44. parrot/llms/hf.py +45 -0
  45. parrot/llms/openai.py +59 -0
  46. parrot/llms/pipes.py +114 -0
  47. parrot/llms/vertex.py +78 -0
  48. parrot/loaders/__init__.py +20 -0
  49. parrot/loaders/abstract.py +456 -0
  50. parrot/loaders/audio.py +106 -0
  51. parrot/loaders/basepdf.py +102 -0
  52. parrot/loaders/basevideo.py +280 -0
  53. parrot/loaders/csv.py +42 -0
  54. parrot/loaders/dir.py +37 -0
  55. parrot/loaders/excel.py +349 -0
  56. parrot/loaders/github.py +65 -0
  57. parrot/loaders/handlers/__init__.py +5 -0
  58. parrot/loaders/handlers/data.py +213 -0
  59. parrot/loaders/image.py +119 -0
  60. parrot/loaders/json.py +52 -0
  61. parrot/loaders/pdf.py +437 -0
  62. parrot/loaders/pdfchapters.py +142 -0
  63. parrot/loaders/pdffn.py +112 -0
  64. parrot/loaders/pdfimages.py +207 -0
  65. parrot/loaders/pdfmark.py +88 -0
  66. parrot/loaders/pdftables.py +145 -0
  67. parrot/loaders/ppt.py +30 -0
  68. parrot/loaders/qa.py +81 -0
  69. parrot/loaders/repo.py +103 -0
  70. parrot/loaders/rtd.py +65 -0
  71. parrot/loaders/txt.py +92 -0
  72. parrot/loaders/utils/__init__.py +1 -0
  73. parrot/loaders/utils/models.py +25 -0
  74. parrot/loaders/video.py +96 -0
  75. parrot/loaders/videolocal.py +120 -0
  76. parrot/loaders/vimeo.py +106 -0
  77. parrot/loaders/web.py +216 -0
  78. parrot/loaders/web_base.py +112 -0
  79. parrot/loaders/word.py +125 -0
  80. parrot/loaders/youtube.py +192 -0
  81. parrot/manager.py +166 -0
  82. parrot/models.py +372 -0
  83. parrot/py.typed +0 -0
  84. parrot/stores/__init__.py +48 -0
  85. parrot/stores/abstract.py +171 -0
  86. parrot/stores/milvus.py +632 -0
  87. parrot/stores/qdrant.py +153 -0
  88. parrot/tools/__init__.py +12 -0
  89. parrot/tools/abstract.py +53 -0
  90. parrot/tools/asknews.py +32 -0
  91. parrot/tools/bing.py +13 -0
  92. parrot/tools/duck.py +62 -0
  93. parrot/tools/google.py +170 -0
  94. parrot/tools/stack.py +26 -0
  95. parrot/tools/weather.py +70 -0
  96. parrot/tools/wikipedia.py +59 -0
  97. parrot/tools/zipcode.py +179 -0
  98. parrot/utils/__init__.py +2 -0
  99. parrot/utils/parsers/__init__.py +5 -0
  100. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  101. parrot/utils/toml.py +11 -0
  102. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  103. parrot/utils/uv.py +11 -0
  104. parrot/version.py +10 -0
  105. resources/users/__init__.py +5 -0
  106. resources/users/handlers.py +13 -0
  107. resources/users/models.py +205 -0
  108. settings/__init__.py +0 -0
  109. settings/settings.py +51 -0
parrot/loaders/vimeo.py ADDED
@@ -0,0 +1,106 @@
+ from typing import Optional, Union
+ from transformers import pipeline
+ import torch
+ from langchain.docstore.document import Document
+ from .youtube import YoutubeLoader
+
+
+ class VimeoLoader(YoutubeLoader):
+     """
+     Loader for Vimeo videos.
+     """
+     def load_video(self, url: str, video_title: str, transcript: Optional[Union[str, None]] = None) -> list:
+         metadata = {
+             "source": url,
+             "url": url,
+             "index": url,
+             "filename": video_title,
+             "question": '',
+             "answer": '',
+             'type': 'video_transcript',
+             "source_type": self._source_type,
+             "summary": '',
+             "document_meta": {
+                 "language": self._language,
+                 "title": video_title,
+                 "topic_tags": ""
+             }
+         }
+         if self.topics:
+             metadata['document_meta']['topic_tags'] = self.topics
+         if transcript is None:
+             documents = []
+             docs = []
+             # first: download video
+             try:
+                 file_path = self.download_video(url, self._video_path)
+             except Exception:
+                 return []
+             if not file_path:
+                 self.logger.warning(
+                     f"Error downloading File for video: {self._video_path}"
+                 )
+                 return []
+             transcript_path = file_path.with_suffix('.vtt')
+             audio_path = file_path.with_suffix('.mp3')
+             # second: extract audio
+             self.extract_audio(file_path, audio_path)
+             # get the Whisper parser
+             transcript_whisper = self.get_whisper_transcript(audio_path)
+             if transcript_whisper:
+                 transcript = transcript_whisper['text']
+             else:
+                 transcript = ''
+             # Summarize the transcript
+             if transcript:
+                 summary = self.get_summary_from_text(transcript)
+                 # Create Two Documents, one is for transcript, second is VTT:
+                 metadata['summary'] = summary
+             doc = Document(
+                 page_content=transcript,
+                 metadata=metadata
+             )
+             documents.append(doc)
+             if transcript_whisper:
+                 # VTT version:
+                 transcript = self.transcript_to_vtt(transcript_whisper, transcript_path)
+                 doc = Document(
+                     page_content=transcript,
+                     metadata=metadata
+                 )
+                 documents.append(doc)
+                 # Saving every dialog chunk as a separate document
+                 dialogs = self.transcript_to_blocks(transcript_whisper)
+                 for chunk in dialogs:
+                     _meta = {
+                         "index": f"{video_title}:{chunk['id']}",
+                         "document_meta": {
+                             "start": f"{chunk['start_time']}",
+                             "end": f"{chunk['end_time']}",
+                             "id": f"{chunk['id']}",
+                             "language": self._language,
+                             "title": video_title,
+                             "topic_tags": ""
+                         }
+                     }
+                     _info = {**metadata, **_meta}
+                     doc = Document(
+                         page_content=chunk['text'],
+                         metadata=_info
+                     )
+                     docs.append(doc)
+                 documents.extend(docs)
+             return self.split_documents(documents)
+         else:
+             # using the transcript file
+             with open(transcript, 'r') as f:
+                 transcript = f.read()
+             summary = self.get_summary_from_text(transcript)
+             transcript_whisper = None
+             metadata['summary'] = f"{summary!s}"
+             # Create Two Documents, one is for transcript, second is VTT:
+             doc = Document(
+                 page_content=transcript,
+                 metadata=metadata
+             )
+             return self.split_documents([doc])
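For context, a minimal usage sketch of the VimeoLoader above. Its constructor is inherited from YoutubeLoader, which is not part of this diff, so the bare constructor call and the example URL and title below are illustrative assumptions only:

# Hypothetical sketch: VimeoLoader's constructor comes from YoutubeLoader,
# which is not shown in this diff, so no constructor arguments are assumed.
from parrot.loaders.vimeo import VimeoLoader

loader = VimeoLoader()  # constructor arguments omitted; not shown in this diff

# With transcript=None, load_video() downloads the video, extracts MP3 audio,
# transcribes it with Whisper, and returns split Documents: the full
# transcript, a VTT rendering, and one Document per dialog chunk.
docs = loader.load_video(
    url="https://vimeo.com/123456789",  # placeholder URL
    video_title="Example Video",
)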
parrot/loaders/web.py ADDED
@@ -0,0 +1,216 @@
+ from bs4 import BeautifulSoup
+ from markdownify import MarkdownConverter
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import MarkdownTextSplitter
+ from navconfig.logging import logging
+ from .abstract import AbstractLoader
+
+
+ logging.getLogger(name='selenium.webdriver').setLevel(logging.WARNING)
+ logging.getLogger(name='WDM').setLevel(logging.WARNING)
+ logging.getLogger(name='matplotlib').setLevel(logging.WARNING)
+
+
+ class WebLoader(AbstractLoader):
+     """Class to load web pages and extract text."""
+     chrome_options = [
+         "--headless",
+         "--enable-automation",
+         "--lang=en",
+         "--disable-extensions",
+         "--disable-gpu",
+         "--no-sandbox",
+         "--disable-features=NetworkService",
+         "--disable-dev-shm-usage",
+         "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
+     ]
+     def __init__(self, urls: dict, source_type: str = 'website', **kwargs):
+         self.urls = urls
+         self._source_type = source_type
+         self._options = Options()
+         self.timeout: int = kwargs.pop('timeout', 60)
+         for option in self.chrome_options:
+             self._options.add_argument(option)
+         self.driver = webdriver.Chrome(
+             service=Service(ChromeDriverManager().install()),
+             options=self._options
+         )
+         self._mark_splitter = MarkdownTextSplitter(
+             chunk_size=1024,
+             chunk_overlap=10
+         )
+         super().__init__(source_type=source_type, **kwargs)
+
+     def md(self, soup, **options):
+         return MarkdownConverter(**options).convert_soup(soup)
+
+     def clean_html(self, html, tags, objects=[]):
+         soup = BeautifulSoup(html, 'html.parser')
+         page_title = soup.title.string
+         md_text = self.md(soup)
+         # Remove script and style elements
+         for script_or_style in soup(["script", "style", "link"]):
+             script_or_style.decompose()
+         # Extract Content
+         content = []
+         paragraphs = [' '.join(p.get_text().split()) for p in soup.find_all(tags)]
+         # Look for iframe elements and format their src attributes into readable strings
+         iframes = soup.find_all('iframe')
+         for iframe in iframes:
+             video_src = iframe.get('src', '')
+             # You might want to customize the formatting of this string
+             formatted_video = f"Video Link: {video_src}" if video_src else ""
+             content.append(formatted_video)
+         if objects:
+             for obj in objects:
+                 (element, args), = obj.items()
+                 if 'parse_list' in args:
+                     parse_list = args.pop('parse_list')
+                     # Find the element container:
+                     container = soup.find(element, attrs=args)
+                     # Parse list of objects (UL, LI)
+                     name_type = parse_list.pop('type')
+                     params = parse_list.get('find')
+                     el = params.pop(0)
+                     try:
+                         attrs = params.pop(0)
+                     except IndexError:
+                         attrs = {}
+                     elements = container.find_all(el, attrs=attrs)
+                     structured_text = ''
+                     for element in elements:
+                         title = element.find('span', class_='title').get_text(strip=True)
+                         lists = element.find_all('ul')
+                         if lists:
+                             structured_text += f"\nCategory: {title}\n{name_type}:\n"
+                             for ul in lists:
+                                 items = [f"- {li.get_text(strip=True)}" for li in ul.select('li')]
+                                 formatted_list = '\n'.join(items)
+                                 structured_text += formatted_list
+                                 structured_text += "\n"
+                     content.append(structured_text)
+                 else:
+                     elements = soup.find_all(element, attrs=args)
+                     for element in elements:
+                         # Handle <a> tags within the current element
+                         links = element.find_all('a')
+                         for link in links:
+                             # Extract link text and href, format them into a readable string
+                             link_text = link.get_text(strip=True)
+                             href = link.get('href', '')
+                             formatted_link = (
+                                 f"{link_text} (Link: {href})"
+                                 if href
+                                 else link_text
+                             )
+                             # Replace the original link text in the element
+                             # with the formatted version
+                             link.replace_with(formatted_link)
+                         # work with UL lists:
+                         lists = element.find_all('ul')
+                         for ul in lists:
+                             items = [li.get_text(strip=True) for li in ul.select('li')]
+                             formatted_list = '\n'.join(items)
+                             content.append(formatted_list)
+                         cleaned_text = ' '.join(element.get_text().split())
+                         content.append(cleaned_text)
+         return (content + paragraphs, md_text, page_title)
+
+     def get(self, address: dict) -> list:
+         (url, args), = address.items()
+         self.logger.info(
+             f'Downloading URL {url} with args {args}'
+         )
+         locator = args.get('locator', (By.TAG_NAME, 'body'))
+         wait = WebDriverWait(self.driver, self.timeout)
+         acookies = args.get('accept_cookies', False)
+         try:
+             self.driver.get(url)
+             # After loading page, accept cookies
+             wait.until(
+                 EC.presence_of_element_located(
+                     locator
+                 )
+             )
+             if acookies:
+                 btn = wait.until(
+                     EC.element_to_be_clickable(
+                         acookies
+                     )
+                 )
+                 btn.click()
+         except Exception as exc:
+             print(f"Failed to Get {url}: {exc}")
+             self.logger.exception(
+                 str(exc), stack_info=True
+             )
+             raise
+         try:
+             extract = args.get('tags', ['p', 'title', 'h1', 'h2', 'section', 'article'])
+             objects = args.get('objects', [])
+             source_type = args.get('source_type', self._source_type)
+             html_content = self.driver.page_source
+             content, md_text, page_title = self.clean_html(
+                 html_content,
+                 extract,
+                 objects
+             )
+             metadata = {
+                 "source": url,
+                 "index": page_title,
+                 "url": url,
+                 "filename": page_title,
+                 "question": '',
+                 "answer": '',
+                 "source_type": source_type,
+                 'type': 'webpage',
+                 'summary': '',
+                 "document_meta": {
+                     "language": "en",
+                     "title": page_title,
+                 },
+             }
+             docs = []
+             if md_text:
+                 docs.append(
+                     Document(
+                         page_content=md_text,
+                         metadata=metadata
+                     )
+                 )
+                 # for chunk in self._mark_splitter.split_text(md_text):
+                 #     docs.append(
+                 #         Document(
+                 #             page_content=chunk,
+                 #             metadata=metadata
+                 #         )
+                 #     )
+             if content:
+                 site_content = [
+                     Document(
+                         page_content=paragraph,
+                         metadata=metadata
+                     ) for paragraph in content
+                 ]
+                 return docs + site_content
+         except Exception as exc:
+             print(f"Failed to load {url}: {exc}")
+
+     def load(self, **kwargs) -> list:
+         documents = []
+         for address in self.urls:
+             docs = self.get(address)
+             if docs:
+                 documents.extend(docs)
+         self.driver.quit()
+         return self.split_by_tokens(documents)
+
+     def parse(self, source):
+         pass
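A usage sketch for WebLoader, inferred from the __init__ and get() signatures above: urls is a list of one-entry mappings from URL to per-page options, and any extra keyword arguments required by AbstractLoader are omitted here as assumptions beyond this diff:

from selenium.webdriver.common.by import By
from parrot.loaders.web import WebLoader

# Each entry maps a URL to its scraping options; the keys shown here
# ('locator', 'tags') are among those actually read in get().
urls = [
    {
        "https://example.com/": {
            "locator": (By.TAG_NAME, "body"),      # element to wait for
            "tags": ["p", "h1", "h2", "article"],  # tags extracted as text
        }
    }
]
loader = WebLoader(urls=urls, timeout=30)
documents = loader.load()  # drives headless Chrome, then quits the driver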
parrot/loaders/web_base.py ADDED
@@ -0,0 +1,112 @@
+ import asyncio
+ import aiohttp
+ from markdownify import MarkdownConverter
+ from bs4 import BeautifulSoup as bs
+ from langchain.text_splitter import MarkdownTextSplitter
+ from langchain.docstore.document import Document
+ from .abstract import AbstractLoader
+
+
+ class WebBaseLoader(AbstractLoader):
+     """Class to load web pages and extract text as Markdown."""
+     def __init__(self, urls: dict, source_type: str = 'website', **kwargs):
+         self.urls = urls
+         self._source_type = source_type
+         self.timeout: int = kwargs.pop('timeout', 60)
+         self._wait: int = kwargs.pop('wait', 60)
+         super().__init__(source_type=source_type, **kwargs)
+         self.md_splitter = MarkdownTextSplitter(
+             chunk_size=1024,
+             chunk_overlap=10
+         )
+
+     def md(self, soup, **options):
+         return MarkdownConverter(**options).convert_soup(soup)
+
+     async def _fetch(
+         self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
+     ) -> str:
+         async with aiohttp.ClientSession() as session:
+             for i in range(retries):
+                 try:
+                     async with session.get(url, allow_redirects=True) as response:
+                         if response.status == 200:
+                             return await response.text()
+                         elif response.status == 429:
+                             # Too many requests
+                             if i == retries - 1:
+                                 return ''
+                             self.logger.warning(
+                                 f"Too many requests to {url}. Waiting {self._wait} seconds."
+                             )
+                             await asyncio.sleep(self._wait)
+                         else:  # Other non-success status codes
+                             response.raise_for_status()  # Raise for other errors
+                 except aiohttp.ClientConnectionError as e:
+                     if i == retries - 1:
+                         raise
+                     else:
+                         self.logger.warning(
+                             f"Error fetching {url} with attempt "
+                             f"{i + 1}/{retries}: {e}. Retrying..."
+                         )
+                         await asyncio.sleep(cooldown * backoff**i)
+                 except aiohttp.ClientResponseError as e:
+                     self.logger.warning(
+                         f"Request failed (ClientResponseError): {e}"
+                     )
+                     return ''
+             self.logger.warning(f"Failed to fetch {url} after {retries} attempts.")
+             return ''
+
+     async def get_address(self, url: str) -> list:
+         self.logger.info(
+             f'Downloading URL {url}'
+         )
+         html = await self._fetch(url)
+         docs = []
+         if html:
+             soup = bs(html, 'html.parser')
+             md_text = self.md(soup)
+             try:
+                 title = soup.title.string
+             except AttributeError:
+                 title = None
+             metadata = {
+                 "url": url,
+                 "source": url,
+                 "index": "",
+                 "filename": '',
+                 "type": 'webpage',
+                 "question": '',
+                 "answer": '',
+                 "source_type": self._source_type,
+                 "summary": "",
+                 "document_meta": {
+                     "title": title,
+                     "description": soup.find('meta', attrs={'name': 'description'})['content']
+                     if soup.find('meta', attrs={'name': 'description'}) else '',
+                     "keywords": soup.find('meta', attrs={'name': 'keywords'})['content']
+                     if soup.find('meta', attrs={'name': 'keywords'}) else '',
+                 }
+             }
+             for chunk in self.md_splitter.split_text(md_text):
+                 docs.append(
+                     Document(
+                         page_content=chunk,
+                         metadata=metadata
+                     )
+                 )
+         return docs
+
+     async def load(self, **kwargs) -> list:
+         documents = []
+         if self.urls is None:
+             return []
+         for address in self.urls:
+             docs = await self.get_address(address)
+             documents.extend(docs)
+         return self.split_documents(documents)
+
+     def parse(self, source):
+         pass
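A usage sketch for WebBaseLoader. Although urls is annotated as dict, load() iterates it and hands each element to get_address() as a plain URL string, so a list of strings is assumed here; load() is a coroutine and must be awaited:

import asyncio
from parrot.loaders.web_base import WebBaseLoader

async def main():
    # 'wait' is the sleep (in seconds) applied when a server answers HTTP 429
    loader = WebBaseLoader(urls=["https://example.com/"], wait=10)
    docs = await loader.load()
    for doc in docs:
        print(doc.metadata["document_meta"]["title"], len(doc.page_content))

asyncio.run(main())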
parrot/loaders/word.py ADDED
@@ -0,0 +1,125 @@
+ from collections.abc import Callable
+ from pathlib import Path, PurePath
+ from typing import Union
+ import mammoth
+ import docx
+ from markdownify import markdownify as md
+ from langchain_community.document_loaders.word_document import (
+     UnstructuredWordDocumentLoader
+ )
+ from langchain.text_splitter import MarkdownTextSplitter
+ from langchain.docstore.document import Document
+ from .abstract import AbstractLoader
+
+
+ class MSWordLoader(AbstractLoader):
+     """
+     Loader for Microsoft Word files.
+     """
+     _extension: list = ['.docx', '.doc']
+
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Union[str, Callable] = None,
+         text_splitter: Union[str, Callable] = None,
+         source_type: str = 'document',
+         **kwargs
+     ):
+         self.path = path
+         if isinstance(path, str):
+             path = Path(path)
+         self._use_mammoth: bool = kwargs.pop('use_mammoth', True)
+         self._md_splitter = MarkdownTextSplitter(chunk_size=1024, chunk_overlap=10)
+         super().__init__(
+             tokenizer=tokenizer,
+             text_splitter=text_splitter,
+             source_type=source_type,
+             **kwargs
+         )
+
+     def _load_document(self, path: PurePath) -> list:
+         if self._check_path(path):
+             docs = []
+             self.logger.info(f"Loading Word file: {path}")
+             if self._use_mammoth:
+                 with open(path, "rb") as docx_file:
+                     doc = docx.Document(str(path))
+                     prop = doc.core_properties
+                     result = mammoth.convert_to_html(docx_file)
+                     # result = mammoth.extract_raw_text(docx_file)
+                     html = result.value  # The generated HTML
+                     md_text = md(html)  # The generated Markdown
+                     try:
+                         summary = self.get_summary_from_text(md_text)
+                     except ValueError:
+                         summary = ''
+                     metadata = {
+                         "url": '',
+                         "source": path.name,
+                         "filename": path.name,
+                         "index": str(path.name),
+                         "type": 'document',
+                         "question": '',
+                         "answer": '',
+                         "source_type": self._source_type,
+                         "data": {},
+                         "summary": summary,
+                         "document_meta": {
+                             "author": prop.author,
+                             "version": prop.version,
+                             "title": prop.title,
+                             "created": prop.created.strftime("%Y-%m-%d %H:%M:%S"),
+                             "last_modified": prop.modified.strftime("%Y-%m-%d %H:%M:%S")
+                         }
+                     }
+                     for chunk in self._md_splitter.split_text(md_text):
+                         _idx = {
+                             **metadata
+                         }
+                         docs.append(
+                             Document(
+                                 page_content=chunk,
+                                 metadata=_idx
+                             )
+                         )
+                     return docs
+             else:
+                 word_loader = UnstructuredWordDocumentLoader(
+                     file_path=str(path)
+                 )
+                 docs = word_loader.load()
+                 for doc in docs:
+                     # Fix This
+                     doc.metadata['source_type'] = self._source_type
+         return []
+
+     def load(self) -> list:
+         """
+         Load data from a DOCX file.
+
+         Args:
+             source (str): The path to the DOCX file.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if not self.path.exists():
+             raise FileNotFoundError(f"DOCX file/directory not found: {self.path}")
+         if self.path.is_dir():
+             documents = []
+             # iterate over the files in the directory
+             for ext in self._extension:
+                 for item in self.path.glob(f'*{ext}'):
+                     documents.extend(self._load_document(item))
+         elif self.path.is_file():
+             documents = self._load_document(self.path)
+         else:
+             raise ValueError(
+                 f"DOCX Loader: Invalid path: {self.path}"
+             )
+         # return documents
+         return self.split_documents(documents)
+
+     def parse(self, source):
+         pass
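A usage sketch for MSWordLoader based on the code above. Note that __init__ stores the raw path argument on self.path before the str-to-Path conversion, so a pathlib.Path is passed here to keep the exists()/is_dir() checks in load() working; the file name is hypothetical:

from pathlib import Path
from parrot.loaders.word import MSWordLoader

loader = MSWordLoader(
    path=Path("reports/quarterly.docx"),  # hypothetical path; may also be a directory
    use_mammoth=True,  # DOCX -> HTML (mammoth) -> Markdown, chunked at 1024 chars
)
documents = loader.load()
for doc in documents:
    print(doc.metadata["document_meta"]["author"], len(doc.page_content))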