ai-parrot 0.3.4__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic.
- ai_parrot-0.3.4.dist-info/LICENSE +21 -0
- ai_parrot-0.3.4.dist-info/METADATA +319 -0
- ai_parrot-0.3.4.dist-info/RECORD +109 -0
- ai_parrot-0.3.4.dist-info/WHEEL +6 -0
- ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
- parrot/__init__.py +21 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +728 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +366 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +83 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/odoo.py +17 -0
- parrot/chatbots/retrievals/__init__.py +578 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +110 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +162 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +137 -0
- parrot/llms/abstract.py +47 -0
- parrot/llms/anthropic.py +42 -0
- parrot/llms/google.py +42 -0
- parrot/llms/groq.py +45 -0
- parrot/llms/hf.py +45 -0
- parrot/llms/openai.py +59 -0
- parrot/llms/pipes.py +114 -0
- parrot/llms/vertex.py +78 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/audio.py +106 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +437 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +120 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +166 -0
- parrot/models.py +372 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +48 -0
- parrot/stores/abstract.py +171 -0
- parrot/stores/milvus.py +632 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +12 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/excel.py
ADDED
@@ -0,0 +1,349 @@
from collections.abc import Callable
from pathlib import PurePath
from typing import Union
from io import StringIO
import mimetypes
import magic
import xlrd
import numpy as np
import pandas
from pandas._libs.parsers import STR_NA_VALUES  # pylint: disable=E0611
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter
)
from langchain.docstore.document import Document
from navigator.libs.json import JSONContent
from .abstract import AbstractLoader


excel_based = (
    "application/vnd.ms-excel.sheet.binary.macroEnabled.12",
    "application/vnd.ms-excel.sheet.macroEnabled.12",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel"
)

supported_extensions = (
    ".xls",
    ".xlsx",
    ".xlsm",
    ".xlsb"
)

class ExcelLoader(AbstractLoader):
    """
    Loader for Excel files using Pandas (preserving Table structure).
    """

    _extension = supported_extensions

    def __init__(
        self,
        path: Union[list[PurePath], PurePath],
        tokenizer: Callable = None,
        text_splitter: Callable = None,
        source_type: str = 'Excel',
        **kwargs
    ):
        self.path = path
        # Index Key:
        self._index_keys: list = kwargs.pop('index', [])
        self._type: str = kwargs.pop('document_type', 'Excel')
        self.sheet_name = kwargs.pop('sheet_name', "Sheet 1")
        self.skiprows = kwargs.pop('skiprows', 0)
        self._pdargs = kwargs.pop('pd_args', {})
        self._magic = magic.Magic(mime=True)
        self.mimetype = kwargs.pop('mimetype', None)
        self.filter_nan: bool = kwargs.pop('filter_nan', True)
        self.na_values: list = ["NULL", "TBD"]
        # Operations over dataframe:
        self._drop_empty = kwargs.pop('drop_empty', False)
        self._trim = kwargs.pop('trim', False)
        self._infer_types = kwargs.pop('infer_types', True)
        self._fillna: bool = kwargs.pop('fillna', False)
        self._rename_cols = kwargs.pop('rename_cols', {})
        self._to_integer: list = kwargs.pop('to_integer', [])
        super().__init__(
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            **kwargs
        )
        # JSON encoder:
        self._encoder = JSONContent()

    def clean_empty(self, df: pandas.DataFrame, columns: list = None) -> pandas.DataFrame:
        """
        Clean empty rows and columns from a DataFrame.

        Args:
            df (pd.DataFrame): The DataFrame to clean.

        Returns:
            pd.DataFrame: The cleaned DataFrame.
        """
        df.dropna(axis=1, how="all", inplace=True)
        df.dropna(axis=0, how="all", inplace=True)
        if columns:
            for column in columns:
                condition = df[
                    (df[column].empty)
                    | (df[column] == "")
                    | (df[column].isna())
                ].index
                df.drop(condition, inplace=True)
        return df

    def open_excel(self, filename: PurePath) -> pandas.DataFrame:
        self.logger.debug(
            f"Opening Excel file {filename}"
        )
        ## Define NA Values:
        default_missing = STR_NA_VALUES.copy()
        for val in self.na_values:  # pylint: disable=E0203
            default_missing.add(val)
            default_missing.add(val)
        self.na_values = default_missing
        if filename.suffix not in supported_extensions:
            raise ValueError(
                f"Unsupported Excel file format: {filename.suffix}"
            )
        if not filename.exists():
            raise FileNotFoundError(
                f"Excel file not found: {filename}"
            )
        if not self.mimetype:
            try:
                self.mimetype = self._magic.from_file(str(filename))
                self.logger.debug(f":: Detected MIME IS: {self.mimetype}")
            except Exception as exc:
                self.logger.error(f":: Error detecting MIME: {exc}")
                self.mimetype = mimetypes.guess_type(str(filename))[0]
        if not self.mimetype:
            # Cannot Detect Mime type:
            ext = filename.suffix
            if ext == ".xlsx" or ext == ".xls":
                self.mimetype = "application/vnd.ms-excel"
            elif ext == ".csv" or ext == ".txt":
                self.mimetype = "text/csv"
        if (
            self.mimetype == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ):
            # xlsx or any openxml based document
            file_engine = "openpyxl"
        elif self.mimetype == "application/vnd.ms-excel.sheet.binary.macroEnabled.12":
            # xlsb
            file_engine = "pyxlsb"
        else:
            # Using extension:
            ext = filename.suffix
            if ext == ".xlsx":
                file_engine = "openpyxl"
            elif ext == ".xls":
                file_engine = "xlrd"
            elif ext == ".xlsb":
                file_engine = "pyxlsb"
            else:
                raise ValueError(
                    f"Unsupported Excel file format: {filename.suffix}"
                )
        # Build Arguments:
        pd_args = self._pdargs.copy()
        pd_args['sheet_name'] = self.sheet_name
        pd_args['skiprows'] = self.skiprows
        pd_args['engine'] = file_engine
        pd_args['na_values'] = self.na_values
        pd_args['na_filter'] = self.filter_nan
        try:
            df = pandas.read_excel(
                filename,
                keep_default_na=False,
                **pd_args
            )
        except (IndexError, xlrd.biffh.XLRDError) as err:
            raise ValueError(
                f"Excel Index error on File {filename}: {err}"
            ) from err
        except pandas.errors.EmptyDataError as err:
            raise ValueError(f"Empty File {filename}: {err}") from err
        except pandas.errors.ParserError as err:
            raise ValueError(f"Error Parsing File {filename}: {err}") from err
        except Exception as err:
            self.logger.exception(str(err), stack_info=True)
            raise
        # Post-Processing the DataFrame:
        if self._infer_types is True:
            df.infer_objects()
        # rename cols:
        if self._rename_cols:
            try:
                # Renaming Pandas Columns:
                df.rename(columns=self._rename_cols, inplace=True)
            except Exception as err:
                self.logger.error(
                    f"Error Renaming Columns: {err}"
                )
        # Clean Empty Rows:
        df = self.clean_empty(df, self._index_keys)
        if self._drop_empty:
            df.dropna(axis=1, how="all", inplace=True)
            df.dropna(axis=0, how="all", inplace=True)
            df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
        if self._trim:
            cols = df.select_dtypes(include=["object", "string"])
            for col in cols:
                df[col] = df[col].astype(str).str.strip()
            # Trim whitespace from all string columns
            # df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
        if self._to_integer:
            for column in self._to_integer:
                df[column] = df[column].fillna(0)
                try:
                    df[column] = pandas.to_numeric(df[column], errors="coerce")
                    df[column] = df[column].astype('Int64')
                except Exception:
                    continue
        if self._fillna:
            # Select all integer columns (both nullable and non-nullable)
            int_columns = df.select_dtypes(
                include=['int64', 'Int64', 'int32', 'Int32']
            ).columns  # Add other integer types if necessary
            # Fill NaN values with zeros in those columns
            df[int_columns] = df[int_columns].fillna(0)
            # Select the Strings:
            str_columns = df.select_dtypes(include=["object", "string"]).columns
            df[str_columns] = df[str_columns].astype(str).replace(["nan", np.nan], "", regex=True)
        print(df)
        print("::: Printing Column Information === ")
        for column, t in df.dtypes.items():
            print(column, "->", t, "->", df[column].iloc[0])
        return df

    def unique_columns(self, df: pandas.DataFrame) -> pandas.DataFrame:
        """
        Rename duplicate columns in the DataFrame to ensure they are unique.

        Args:
            df (pd.DataFrame): The DataFrame with potential duplicate column names.

        Returns:
            pd.DataFrame: A DataFrame with unique column names.
        """
        seen = {}
        new_columns = []
        for col in df.columns:
            new_col = col
            count = seen.get(col, 0)
            while new_col in new_columns:
                count += 1
                new_col = f"{col}_{count}"
            new_columns.append(new_col)
            seen[col] = count
        df.columns = new_columns
        return df

    def get_json(self, df: pandas.DataFrame) -> str:
        """
        Convert a DataFrame to a JSON string.

        Args:
            df (pd.DataFrame): The DataFrame to convert.

        Returns:
            str: The JSON string.
        """
        buffer = StringIO()
        df = self.unique_columns(df)
        df.to_json(buffer, orient='records')
        buffer.seek(0)
        return buffer.getvalue()

    def row_to_json(self, data) -> str:
        return self._encoder.dumps(data)

    def row_to_string(self, data: dict) -> str:
        results = []
        for key, val in data.items():
            results.append(f'{key}: "{val!s}"')
        return ', '.join(results)

    def _load_excel(self, path: PurePath) -> list:
        """
        Load an Excel file using the Pandas library.

        Args:
            path (Path): The path to the Excel file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading Excel file: {path}")
            df = self.open_excel(path)
            # Check for row unicity:
            df = self.unique_columns(df)
            metadata = {
                "url": '',
                "index": '',
                "source": str(path.name),
                "filename": path.name,
                "question": '',
                "answer": '',
                "summary": '',
                "source_type": self._source_type,
                "type": self._type,
            }
            document_meta = {
                "columns": df.columns.tolist(),
                "rows": str(len(df)),
                "sheet": self.sheet_name,
            }
            documents = []
            # remove NaN values:
            df.fillna('', axis=1, inplace=True)
            for idx, row in df.iterrows():
                idk = ''
                rw = row.to_dict()
                for col in self._index_keys:
                    if col in row:
                        idk = row[col]
                        break
                _data = {
                    "index": idk,
                    "data": rw,
                    "document_meta": {
                        "row_index": idx,
                        **document_meta,
                        **rw
                    }
                }
                _meta = {**metadata, **_data}
                row_data = self.row_to_string(rw)
                doc = Document(
                    page_content=row_data,
                    metadata=_meta
                )
                documents.append(doc)
            return documents
        return []

    def load(self, max_tokens: int = 768) -> list:
        documents = []
        if self.path.is_file():
            # single File:
            documents = self._load_excel(self.path)
        elif self.path.is_dir():
            documents = []
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    documents.extend(self._load_excel(item))
        elif isinstance(self.path, list):
            pass
        # Split Table Data:
        return self.split_documents(documents)

    def parse(self, source):
        raise NotImplementedError(
            "Parser method is not implemented for ExcelLoader."
        )
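Usage note: the loader reads one worksheet into a DataFrame, applies the optional cleanups configured in `__init__` (trim, rename, integer coercion, NaN filling), and emits one langchain `Document` per row, with the row serialized as `key: "value"` pairs. A minimal sketch, assuming the `AbstractLoader` defaults suffice; the workbook name and `index` column are hypothetical:

from pathlib import Path
from parrot.loaders.excel import ExcelLoader

loader = ExcelLoader(
    Path("stores.xlsx"),      # hypothetical workbook (a concrete Path, so load() can call is_file())
    sheet_name="Sheet 1",     # the loader's default sheet name
    index=["store_id"],       # hypothetical column used as the per-row index key
    trim=True,                # strip whitespace from string columns
    drop_empty=True,          # drop all-empty rows/columns and 'Unnamed' columns
)
docs = loader.load()          # one Document per row, passed through split_documents()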
parrot/loaders/github.py
ADDED
@@ -0,0 +1,65 @@
from pathlib import PurePath
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_community.document_loaders import GithubFileLoader
from langchain_text_splitters import Language
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter
)
from navconfig import config
from .abstract import AbstractLoader

class GithubLoader(AbstractLoader):
    """
    Load Code from a Github Repository.
    """
    def __init__(
        self,
        repository_url: str,
        lang: str = 'python',
        source_type: str = 'code',
        branch: str = 'main',
        **kwargs
    ):
        super().__init__(source_type=source_type, **kwargs)
        self._url = repository_url
        self.branch = branch
        self.github_token = kwargs.get('github_token', config.get('GITHUB_TOKEN'))
        self.lang = lang
        if lang == 'python':
            self.parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)
            self.splitter = RecursiveCharacterTextSplitter.from_language(
                language=Language.PYTHON, chunk_size=1024, chunk_overlap=200
            )
            self.suffixes = [".py", ".pyx"]
        elif lang == 'javascript':
            self.parser = LanguageParser(language=Language.JS, parser_threshold=500)
            self.splitter = RecursiveCharacterTextSplitter.from_language(
                language=Language.JS, chunk_size=1024, chunk_overlap=200
            )
            self.suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
        elif lang == 'typescript':
            self.parser = LanguageParser(language=Language.TS, parser_threshold=500)
            self.splitter = RecursiveCharacterTextSplitter.from_language(
                language=Language.TS, chunk_size=1024, chunk_overlap=200
            )
            self.suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
        else:
            raise ValueError(
                f"Language {lang} not supported for Repository"
            )

    def load(self) -> list:
        self.logger.info(f'Loading Github Repository > {self._url}:<{self.branch}>')
        loader = GithubFileLoader(
            repo=self._url,
            github_api_url='https://api.github.com',
            access_token=self.github_token,
            branch=self.branch,
        )
        docs = loader.load()
        for doc in docs:
            doc.metadata['source_type'] = self._source_type
        return self.split_documents(docs)

    def parse(self, source):
        raise NotImplementedError("Parser method is not implemented for PDFLoader.")
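Usage note: the loader delegates to langchain's `GithubFileLoader`, which takes the repository in `owner/repo` form and a GitHub access token (here falling back to the `GITHUB_TOKEN` config key when none is passed). A minimal sketch with placeholder values:

from parrot.loaders.github import GithubLoader

loader = GithubLoader(
    repository_url="owner/repo",   # hypothetical repository, owner/repo form
    lang="python",                 # selects the Python LanguageParser and code-aware splitter
    branch="main",
    github_token="<token>",        # hypothetical; otherwise read from navconfig
)
docs = loader.load()               # Documents tagged source_type='code', then split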
parrot/loaders/handlers/data.py
ADDED
@@ -0,0 +1,213 @@
"""
Data Management for Bots.
"""
from asyncdb.exceptions import NoDataFound, DriverError
from querysource.datasources.drivers.rethink import rethink_default
from navigator.views import FormModel
from ..utils.models import BotData
from ...interfaces import DBInterface

class DataManagement(DBInterface, FormModel):
    """
    Managing Data to be inserted into Vector Databases.
    """
    model = BotData
    path: str = '/api/v1/bot_management/data'

    async def get_rethink(self):
        params = rethink_default.params()
        return self.get_database('rethink', params=params)

    async def _get_form(self, args, qp, fields):
        """Get Form information."""
        bot = args.get('id', None)
        async with await self.get_rethink() as conn:
            await conn.use('navigator')
            filter = {}
            if bot:
                filter = {
                    "chatbot_id": bot
                }
            try:
                result = await conn.fetch_all(
                    'chatbots_data',
                    filter=filter
                )
            except NoDataFound:
                return self.json_response(
                    status=204
                )
            return self.json_response(
                result, status=200
            )

    async def put(self):
        """
        Creating a new data-store record.
        """
        data = await self.validate_payload()
        args = self.get_arguments()
        # unique field is ['chatbot_id', 'source_type', 'version']
        bot = args.get('id', data.chatbot_id)
        filter = {
            'chatbot_id': str(bot),
            'source_type': data.source_type,
            'version': data.version
        }
        tbl = data.Meta.name
        exists = False
        async with await self.get_rethink() as conn:
            await conn.use('navigator')
            try:
                result = await conn.fetch_one(
                    table=tbl,
                    filter=filter
                )
                if result:
                    exists = True
            except NoDataFound:
                exists = False
            if exists is False:
                try:
                    state = await conn.insert(
                        tbl, data.to_dict(), on_conflict='replace'
                    )
                    return self.json_response(
                        response={
                            "message": "Bot data inserted",
                            "state": state,
                        },
                        status=203
                    )
                except DriverError as exc:
                    return self.error(
                        response={
                            "error": "unable to insert RT data",
                            "exception": str(exc)
                        },
                        status=400
                    )
            else:
                data.version += data.version
                try:
                    state = await conn.update(
                        tbl, data.to_dict(), filter=filter, return_changes=True
                    )
                    return self.json_response(
                        response={
                            "message": "Bot data updated",
                            "state": state,
                        },
                        status=202
                    )
                except DriverError as exc:
                    return self.error(
                        response={
                            "error": "unable to insert RT data",
                            "exception": str(exc)
                        },
                        status=400
                    )

    async def patch(self):
        """
        Patching (updating) existing data.
        """
        # data = await self.validate_payload()
        data = await self.json_data()
        args = self.get_arguments()
        # unique field is ['chatbot_id', 'source_type', 'version']
        bot = args.get('id', data['chatbot_id'])
        try:
            filter = {
                'chatbot_id': str(bot),
                'source_type': data['source_type'],
                'version': data['version']
            }
        except KeyError:
            return self.error(
                response={
                    "error": "Invalid data for Data Filtering"
                },
                status=400
            )
        tbl = 'chatbots_data'
        async with await self.get_rethink() as conn:
            await conn.use('navigator')
            try:
                result = await conn.fetch_one(
                    table=tbl,
                    filter=filter
                )
                if result:
                    # update existing:
                    for k, v in data.items():
                        result[k] = v
                    try:
                        state = await conn.update(
                            tbl, result, filter=filter, return_changes=True
                        )
                        return self.json_response(
                            response={
                                "message": "Bot data updated",
                                "state": state,
                            },
                            status=202
                        )
                    except DriverError as exc:
                        return self.error(
                            response={
                                "error": "unable to Update RT data",
                                "exception": str(exc)
                            },
                            status=400
                        )
            except NoDataFound:
                return self.error(
                    response={
                        "error": f"Data not Found for {filter}",
                    },
                    status=400
                )

    async def delete(self):
        """
        Remove a data-store record
        """
        data = await self.validate_payload()
        args = self.get_arguments()
        # unique field is ['chatbot_id', 'source_type', 'version']
        bot = args.get('id', data.chatbot_id)
        filter = {
            'chatbot_id': str(bot),
            'source_type': data.source_type,
            'version': data.version
        }
        tbl = data.Meta.name
        async with await self.get_rethink() as conn:
            await conn.use('navigator')
            try:
                result = await conn.fetch_one(
                    table=tbl,
                    filter=filter
                )
                if result:
                    state = await conn.delete(
                        table=tbl,
                        filter=filter,
                        return_changes=True
                    )
                    return self.json_response(
                        response={
                            "message": "Bot data deleted",
                            "state": state,
                        },
                        status=202
                    )
            except NoDataFound:
                return self.error(
                    response={
                        "error": f"Data not Found for {filter}",
                    },
                    status=400
                )
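Usage note: records are keyed by the composite (`chatbot_id`, `source_type`, `version`); `put` inserts when that key is new and otherwise bumps the version and rewrites the record, while `patch` merges the submitted fields into the stored document. A hedged client-side sketch against the route registered above; the host, bot id, and payload values are all hypothetical:

import asyncio
import aiohttp

async def upsert_bot_data():
    payload = {
        "chatbot_id": "<bot-uuid>",   # hypothetical bot id
        "source_type": "excel",       # hypothetical source type
        "version": 1,
    }
    # hypothetical host; PUT inserts when the composite key is new,
    # otherwise updates the existing record.
    async with aiohttp.ClientSession(base_url="http://localhost:5000") as session:
        async with session.put("/api/v1/bot_management/data", json=payload) as resp:
            print(resp.status, await resp.json())

asyncio.run(upsert_bot_data())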