ai-parrot 0.1.0__cp311-cp311-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ai-parrot might be problematic.

Files changed (108)
  1. ai_parrot-0.1.0.dist-info/LICENSE +21 -0
  2. ai_parrot-0.1.0.dist-info/METADATA +299 -0
  3. ai_parrot-0.1.0.dist-info/RECORD +108 -0
  4. ai_parrot-0.1.0.dist-info/WHEEL +5 -0
  5. ai_parrot-0.1.0.dist-info/top_level.txt +3 -0
  6. parrot/__init__.py +18 -0
  7. parrot/chatbots/__init__.py +7 -0
  8. parrot/chatbots/abstract.py +965 -0
  9. parrot/chatbots/asktroc.py +16 -0
  10. parrot/chatbots/base.py +257 -0
  11. parrot/chatbots/basic.py +9 -0
  12. parrot/chatbots/bose.py +17 -0
  13. parrot/chatbots/cody.py +17 -0
  14. parrot/chatbots/copilot.py +100 -0
  15. parrot/chatbots/dataframe.py +103 -0
  16. parrot/chatbots/hragents.py +15 -0
  17. parrot/chatbots/oddie.py +17 -0
  18. parrot/chatbots/retrievals/__init__.py +515 -0
  19. parrot/chatbots/retrievals/constitutional.py +19 -0
  20. parrot/conf.py +108 -0
  21. parrot/crew/__init__.py +3 -0
  22. parrot/crew/tools/__init__.py +22 -0
  23. parrot/crew/tools/bing.py +13 -0
  24. parrot/crew/tools/config.py +43 -0
  25. parrot/crew/tools/duckgo.py +62 -0
  26. parrot/crew/tools/file.py +24 -0
  27. parrot/crew/tools/google.py +168 -0
  28. parrot/crew/tools/gtrends.py +16 -0
  29. parrot/crew/tools/md2pdf.py +25 -0
  30. parrot/crew/tools/rag.py +42 -0
  31. parrot/crew/tools/search.py +32 -0
  32. parrot/crew/tools/url.py +21 -0
  33. parrot/exceptions.cpython-311-x86_64-linux-gnu.so +0 -0
  34. parrot/handlers/__init__.py +4 -0
  35. parrot/handlers/bots.py +196 -0
  36. parrot/handlers/chat.py +169 -0
  37. parrot/interfaces/__init__.py +6 -0
  38. parrot/interfaces/database.py +29 -0
  39. parrot/llms/__init__.py +0 -0
  40. parrot/llms/abstract.py +41 -0
  41. parrot/llms/anthropic.py +36 -0
  42. parrot/llms/google.py +37 -0
  43. parrot/llms/groq.py +33 -0
  44. parrot/llms/hf.py +39 -0
  45. parrot/llms/openai.py +49 -0
  46. parrot/llms/pipes.py +103 -0
  47. parrot/llms/vertex.py +68 -0
  48. parrot/loaders/__init__.py +20 -0
  49. parrot/loaders/abstract.py +456 -0
  50. parrot/loaders/basepdf.py +102 -0
  51. parrot/loaders/basevideo.py +280 -0
  52. parrot/loaders/csv.py +42 -0
  53. parrot/loaders/dir.py +37 -0
  54. parrot/loaders/excel.py +349 -0
  55. parrot/loaders/github.py +65 -0
  56. parrot/loaders/handlers/__init__.py +5 -0
  57. parrot/loaders/handlers/data.py +213 -0
  58. parrot/loaders/image.py +119 -0
  59. parrot/loaders/json.py +52 -0
  60. parrot/loaders/pdf.py +187 -0
  61. parrot/loaders/pdfchapters.py +142 -0
  62. parrot/loaders/pdffn.py +112 -0
  63. parrot/loaders/pdfimages.py +207 -0
  64. parrot/loaders/pdfmark.py +88 -0
  65. parrot/loaders/pdftables.py +145 -0
  66. parrot/loaders/ppt.py +30 -0
  67. parrot/loaders/qa.py +81 -0
  68. parrot/loaders/repo.py +103 -0
  69. parrot/loaders/rtd.py +65 -0
  70. parrot/loaders/txt.py +92 -0
  71. parrot/loaders/utils/__init__.py +1 -0
  72. parrot/loaders/utils/models.py +25 -0
  73. parrot/loaders/video.py +96 -0
  74. parrot/loaders/videolocal.py +107 -0
  75. parrot/loaders/vimeo.py +106 -0
  76. parrot/loaders/web.py +216 -0
  77. parrot/loaders/web_base.py +112 -0
  78. parrot/loaders/word.py +125 -0
  79. parrot/loaders/youtube.py +192 -0
  80. parrot/manager.py +152 -0
  81. parrot/models.py +347 -0
  82. parrot/py.typed +0 -0
  83. parrot/stores/__init__.py +0 -0
  84. parrot/stores/abstract.py +170 -0
  85. parrot/stores/milvus.py +540 -0
  86. parrot/stores/qdrant.py +153 -0
  87. parrot/tools/__init__.py +16 -0
  88. parrot/tools/abstract.py +53 -0
  89. parrot/tools/asknews.py +32 -0
  90. parrot/tools/bing.py +13 -0
  91. parrot/tools/duck.py +62 -0
  92. parrot/tools/google.py +170 -0
  93. parrot/tools/stack.py +26 -0
  94. parrot/tools/weather.py +70 -0
  95. parrot/tools/wikipedia.py +59 -0
  96. parrot/tools/zipcode.py +179 -0
  97. parrot/utils/__init__.py +2 -0
  98. parrot/utils/parsers/__init__.py +5 -0
  99. parrot/utils/parsers/toml.cpython-311-x86_64-linux-gnu.so +0 -0
  100. parrot/utils/toml.py +11 -0
  101. parrot/utils/types.cpython-311-x86_64-linux-gnu.so +0 -0
  102. parrot/utils/uv.py +11 -0
  103. parrot/version.py +10 -0
  104. resources/users/__init__.py +5 -0
  105. resources/users/handlers.py +13 -0
  106. resources/users/models.py +205 -0
  107. settings/__init__.py +0 -0
  108. settings/settings.py +51 -0
parrot/loaders/pdfimages.py ADDED
@@ -0,0 +1,207 @@
+ from collections.abc import Callable
+ from pathlib import Path, PurePath
+ from typing import Any
+ import fitz
+ from pdf4llm import to_markdown
+ from PIL import Image
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import MarkdownTextSplitter
+ from transformers import (
+     AutoTokenizer,
+     AutoProcessor,
+     LlavaForConditionalGeneration,
+     pipeline,
+     BitsAndBytesConfig
+ )
+ import torch
+ from .basepdf import BasePDF
+
+
+ # Quantize the vision model to 4-bit to reduce GPU memory usage:
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16
+ )
+
+
+ class PDFImageLoader(BasePDF):
+     """
+     Loader for PDF files that renders each page as an image and
+     describes it with a vision-language model.
+     """
+     default_prompt: str = "<|user|>\n<image>\nExplain this schematic diagram or technical installation instructions and wire diagrams, please be detailed about descriptions of steps:<|end|>\n<|assistant|>\n"
+
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'pdf',
+         language: str = "eng",
+         **kwargs
+     ):
+         super().__init__(
+             path=path,
+             tokenizer=tokenizer,
+             text_splitter=text_splitter,
+             source_type=source_type,
+             language=language,
+             **kwargs
+         )
+         self._image_model = kwargs.get('image_model', 'llava-hf/llava-v1.6-vicuna-7b-hf')
+         self._task = kwargs.get('task', 'image-to-text')
+         self._max_tokens = kwargs.get('max_tokens', 600)
+         # Loading the model with low CPU memory usage
+         # model = LlavaForConditionalGeneration.from_pretrained(
+         #     self._image_model,
+         #     quantization_config=quantization_config,
+         #     device_map="auto",
+         #     torch_dtype=torch.float16,
+         #     low_cpu_mem_usage=True
+         # )
+         # # Load the processor
+         # processor = AutoProcessor.from_pretrained(self._image_model, use_fast=True)
+         self._pipeline = pipeline(
+             self._task,
+             model=self._image_model,
+             # tokenizer=processor.tokenizer,
+             # image_processor=processor.image_processor,
+             model_kwargs={"quantization_config": quantization_config},
+             # device=self._device,
+             max_new_tokens=self._max_tokens,
+             # low_cpu_mem_usage=True,
+             use_fast=True
+         )
+         # default prompt
+         self._prompt = kwargs.get('prompt', self.default_prompt)
+         # Markdown Splitter
+         self._splitter = MarkdownTextSplitter(
+             chunk_size=self._chunk_size,
+             chunk_overlap=10
+         )
+
+     def pixmap_to_pil_image(self, pix):
+         """Converts a PyMuPDF Pixmap object to a PIL Image."""
+         return Image.frombytes(
+             "RGB",
+             [pix.width, pix.height],
+             pix.samples
+         )
+
+     def _load_pdf(self, path: Path) -> list:
+         """
+         Load a PDF file as Images.
+
+         Args:
+             path (Path): The path to the PDF file.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if self._check_path(path):
+             self.logger.info(f"Loading PDF file: {path}")
+             pdf = fitz.open(str(path))  # Open the PDF file
+             docs = []
+             try:
+                 # get markdown for all pages and save it separately
+                 md_text = to_markdown(pdf)
+                 try:
+                     summary = self.get_summary_from_text(md_text)
+                 except Exception:
+                     summary = ''
+                 metadata = {
+                     "url": '',
+                     "idx": str(path.name),
+                     "filename": str(path.name),
+                     "source": str(path.name),
+                     "type": 'pdf',
+                     "question": '',
+                     "answer": '',
+                     "data": {},
+                     "summary": summary,
+                     "source_type": self._source_type,
+                     "document_meta": {
+                         "title": pdf.metadata.get("title", ""),
+                         "author": pdf.metadata.get("author", ""),
+                     }
+                 }
+                 for idx, chunk in enumerate(self._splitter.split_text(md_text)):
+                     _info = {
+                         "index": f"{idx}",
+                         **metadata
+                     }
+                     docs.append(
+                         Document(
+                             page_content=chunk,
+                             metadata=_info
+                         )
+                     )
+             except (IndexError, ValueError) as exc:
+                 self.logger.warning(
+                     f"There is no text data to load on {path.name}: {exc}"
+                 )
+             # Then, process the pages one by one as images:
+             file_name = path.stem.replace(" ", "_").replace(".", "_")
+             documents = []  # collect image-derived Documents across all pages
+             for page_number in range(pdf.page_count):
+                 page_num = page_number + 1
+                 self.logger.notice(
+                     f"Processing PDF {path} on Page {page_num}"
+                 )
+                 page = pdf[page_number]
+                 zoom_x = 2.0  # horizontal zoom
+                 zoom_y = 2.0  # vertical zoom
+                 mat = fitz.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension
+                 # render the page at 2x resolution for better model input
+                 pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB, alpha=False)
+                 img_stream = self.pixmap_to_pil_image(pix)
+                 url = ''
+                 img_name = f'image_{file_name}_{page_num}.png'
+                 if self.save_images is True:
+                     self.save_image(
+                         img_stream,
+                         img_name,
+                         self._imgdir
+                     )
+                     url = f'/static/images/{img_name}'
+                 # extracting features and explanations:
+                 outputs = self._pipeline(
+                     img_stream,
+                     prompt=self._prompt,
+                     generate_kwargs={"max_new_tokens": self._max_tokens}
+                 )
+                 for idx, output in enumerate(outputs):
+                     generated_text = output['generated_text']
+                     # Split using the special tokens, if available
+                     split_text = generated_text.split("<|assistant|>")
+                     prompt_text = split_text[0].replace("<|prompt|>", "").strip() if "<|prompt|>" in generated_text else ""
+                     response_text = split_text[1].strip() if len(split_text) > 1 else ""
+                     # Attach the image using Markdown syntax
+                     image_markdown = f"\n\n![Image]({url})\n"
+                     response_text += image_markdown
+                     _meta = {
+                         "url": f"{url}",
+                         "filename": str(path.name),
+                         "index": f"Page {page_num}, part: {idx}",
+                         "source": str(path.name),
+                         "type": 'pdf',
+                         "question": prompt_text,
+                         "answer": '',
+                         "data": {},
+                         "summary": '',
+                         "source_type": self._source_type,
+                         "document_meta": {
+                             "page": f"Page {page_num}",
+                             "image": f"{img_name}",
+                             "url": f"{url}"
+                         }
+                     }
+                     documents.append(
+                         Document(
+                             page_content=response_text,
+                             metadata=_meta
+                         )
+                     )
+             return docs + documents
+         return []
+
+     def load(self) -> list:
+         try:
+             return super().load()
+         finally:
+             # release the pipeline so the model can be garbage-collected
+             self._pipeline = None
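The loader above leans on BasePDF (not shown in this diff) for path handling and the load() entry point. A minimal usage sketch under that assumption; the file name is hypothetical, and the keyword defaults are the ones visible in __init__:

    from pathlib import Path
    from parrot.loaders.pdfimages import PDFImageLoader

    loader = PDFImageLoader(
        path=Path("installation_manual.pdf"),            # hypothetical input file
        image_model="llava-hf/llava-v1.6-vicuna-7b-hf",  # default shown in __init__
        max_tokens=600,                                  # default shown in __init__
    )
    docs = loader.load()  # assumed BasePDF entry point; the pipeline is released afterwards
    for doc in docs[:3]:
        print(doc.metadata["index"], doc.page_content[:80])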
parrot/loaders/pdfmark.py ADDED
@@ -0,0 +1,88 @@
+ from typing import Any
+ from collections.abc import Callable
+ from pathlib import Path, PurePath
+ import fitz
+ from pdf4llm import to_markdown
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import MarkdownTextSplitter
+ from .basepdf import BasePDF
+
+
+ class PDFMarkdownLoader(BasePDF):
+     """
+     Loader for PDF files that converts their content to Markdown.
+     """
+
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'pdf',
+         language: str = "eng",
+         **kwargs
+     ):
+         super().__init__(
+             path=path,
+             tokenizer=tokenizer,
+             text_splitter=text_splitter,
+             source_type=source_type,
+             language=language,
+             **kwargs
+         )
+         self._splitter = MarkdownTextSplitter(chunk_size=1024, chunk_overlap=10)
+
+     def _load_pdf(self, path: Path) -> list:
+         """
+         Load a PDF file and convert its content to Markdown using PyMuPDF and pdf4llm.
+
+         Args:
+             path (Path): The path to the PDF file.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if self._check_path(path):
+             self.logger.info(f"Loading PDF file: {path}")
+             docs = []
+             pdf = fitz.open(str(path))
+             md_text = to_markdown(pdf)  # get markdown for all pages
+             try:
+                 summary = self.get_summary_from_text(md_text)
+             except Exception:
+                 summary = ''
+             metadata = {
+                 "url": '',
+                 "filename": path.name,
+                 # "index": f"{path.name}",
+                 "source": str(path.name),
+                 "type": 'pdf',
+                 "question": '',
+                 "answer": '',
+                 "data": {},
+                 "summary": summary,
+                 "source_type": self._source_type,
+                 "document_meta": {
+                     "title": pdf.metadata.get("title", ""),
+                     # "subject": pdf.metadata.get("subject", ""),
+                     # "keywords": pdf.metadata.get("keywords", ""),
+                     "creationDate": pdf.metadata.get("creationDate", ""),
+                     # "modDate": pdf.metadata.get("modDate", ""),
+                     # "producer": pdf.metadata.get("producer", ""),
+                     # "creator": pdf.metadata.get("creator", ""),
+                     "author": pdf.metadata.get("author", ""),
+                 }
+             }
+             for idx, chunk in enumerate(self._splitter.split_text(md_text)):
+                 _info = {
+                     "index": f"{idx}",
+                     **metadata
+                 }
+                 docs.append(
+                     Document(
+                         page_content=chunk,
+                         metadata=_info
+                     )
+                 )
+             return docs
+         else:
+             return []
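PDFMarkdownLoader yields one Document per Markdown chunk (1024 characters, overlap 10) with the PDF's title and author carried in document_meta. A sketch, again assuming BasePDF.load() dispatches to _load_pdf for the path given at construction:

    from pathlib import Path
    from parrot.loaders.pdfmark import PDFMarkdownLoader

    loader = PDFMarkdownLoader(path=Path("report.pdf"))  # hypothetical file
    chunks = loader.load()  # assumed BasePDF entry point
    if chunks:
        print(len(chunks), chunks[0].metadata["document_meta"]["title"])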
parrot/loaders/pdftables.py ADDED
@@ -0,0 +1,145 @@
+ from collections.abc import Callable
+ from typing import Any, Optional
+ from pathlib import Path, PurePath
+ from io import StringIO
+ import fitz
+ import pandas as pd
+ from langchain.docstore.document import Document
+ from .basepdf import BasePDF
+
+
+ class PDFTablesLoader(BasePDF):
+     """
+     Loader for Tables in PDF Files.
+     """
+     _extension = ['.pdf']
+
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'pdf',
+         language: str = "eng",
+         table_settings: Optional[dict] = None,  # None instead of a mutable default
+         **kwargs
+     ):
+         super().__init__(
+             path,
+             tokenizer,
+             text_splitter,
+             source_type,
+             language=language,
+             **kwargs
+         )
+         # Table Settings:
+         self.table_settings = {
+             # "vertical_strategy": "text",
+             # "horizontal_strategy": "text",
+             "intersection_x_tolerance": 5,
+             "intersection_y_tolerance": 5
+         }
+         if table_settings:
+             self.table_settings.update(table_settings)
+         self._skiprows = kwargs.pop('skiprows', None)
+
+     def unique_columns(self, df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Rename duplicate columns in the DataFrame to ensure they are unique.
+
+         Args:
+             df (pd.DataFrame): The DataFrame with potential duplicate column names.
+
+         Returns:
+             pd.DataFrame: A DataFrame with unique column names.
+         """
+         seen = {}
+         new_columns = []
+         for col in df.columns:
+             new_col = col
+             count = seen.get(col, 0)
+             while new_col in new_columns:
+                 count += 1
+                 new_col = f"{col}_{count}"
+             new_columns.append(new_col)
+             seen[col] = count
+         df.columns = new_columns
+         return df
+
+     def get_markdown(self, df: pd.DataFrame) -> str:
+         """
+         Convert a DataFrame to a Markdown string.
+
+         Args:
+             df (pd.DataFrame): The DataFrame to convert.
+
+         Returns:
+             str: The Markdown string.
+         """
+         buffer = StringIO()
+         df = self.unique_columns(df)
+         df.to_markdown(buffer)
+         buffer.seek(0)
+         return buffer.getvalue()
+
+     def parse_table(self, table_idx, table, page_number, path) -> tuple:
+         df = table.to_pandas()  # convert to pandas DataFrame
+         df = df.dropna(axis=1, how='all')  # Drop empty columns
+         df = df.dropna(how='all', axis=0)  # Drop empty rows
+         page = page_number + 1
+         table_meta = {
+             "url": '',
+             "source": f"{path.name} Page.#{page} Table.#{table_idx}",
+             "filename": path.name,
+             "index": f"{path.name}:Table:{table_idx}",
+             "question": '',
+             "answer": '',
+             "type": 'table',
+             "data": {},
+             "summary": '',
+             "document_meta": {
+                 "table_index": table_idx,
+                 "table_shape": df.shape,
+                 "table_columns": df.columns.tolist(),
+                 "description": f"Extracted from Page.#{page}."
+             },
+             "source_type": self._source_type
+         }
+         return df, table_meta
+
+     def _load_pdf(self, path: Path) -> list:
+         """
+         Load a PDF file using the Fitz library.
+
+         Args:
+             path (Path): The path to the PDF file.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if self._check_path(path):
+             self.logger.info(f"Loading PDF file: {path}")
+             pdf = fitz.open(str(path))  # Open the PDF file
+             docs = []
+             for page_number in range(pdf.page_count):
+                 page = pdf[page_number]
+                 try:
+                     tabs = page.find_tables(**self.table_settings)
+                     # iterate the TableFinder's table list
+                     for tab_idx, tab in enumerate(tabs.tables):
+                         df, _meta = self.parse_table(tab_idx, tab, page_number, path)
+                         # Log a sample of the extracted table:
+                         self.logger.debug(f"::: Table Information ===\n{df}")
+                         self.logger.debug("::: Column Information === ")
+                         if not df.empty:
+                             for column, t in df.dtypes.items():
+                                 self.logger.debug(f"{column} -> {t} -> {df[column].iloc[0]}")
+                         # convert into markdown:
+                         txt = df.to_markdown()
+                         if txt:
+                             docs.append(
+                                 Document(page_content=txt, metadata=_meta)
+                             )
+                 except Exception as exc:
+                     self.logger.warning(
+                         f"Could not extract tables from page {page_number + 1}: {exc}"
+                     )
+                     continue
+             return docs
+         return []
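Because user-supplied table_settings are merged over the defaults, callers can switch PyMuPDF's find_tables() to text-based detection for borderless tables. A sketch with an assumed BasePDF entry point and a hypothetical file:

    from pathlib import Path
    from parrot.loaders.pdftables import PDFTablesLoader

    loader = PDFTablesLoader(
        path=Path("spec_sheet.pdf"),  # hypothetical input
        table_settings={              # merged over the intersection tolerances above
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        },
    )
    table_docs = loader.load()  # one Markdown Document per detected table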
parrot/loaders/ppt.py ADDED
@@ -0,0 +1,30 @@
+ from pathlib import PurePath
+ from langchain_community.document_loaders import (
+     UnstructuredPowerPointLoader
+ )
+ from .abstract import AbstractLoader
+
+
+ class PPTXLoader(AbstractLoader):
+     """
+     Loader for PPTX files.
+     """
+     _extension: list = ['.pptx']
+
+     def load(self, path: PurePath) -> list:
+         if self._check_path(path):
+             docs = []
+             self.logger.info(f"Loading PPTX file: {path}")
+             ppt_loader = UnstructuredPowerPointLoader(
+                 file_path=str(path)
+             )
+             docs += ppt_loader.load()
+             for doc in docs:
+                 doc.metadata['source_type'] = self._source_type
+             # Split the documents into chunks
+             return self.split_documents(docs)
+         else:
+             return []
+
+     def parse(self, source):
+         pass
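PPTXLoader is a thin wrapper: UnstructuredPowerPointLoader extracts the slides and the inherited split_documents() chunks them. A sketch, assuming AbstractLoader's constructor defaults suffice:

    from pathlib import Path
    from parrot.loaders.ppt import PPTXLoader

    loader = PPTXLoader()                    # tokenizer/text_splitter defaults assumed
    slides = loader.load(Path("deck.pptx"))  # hypothetical file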
parrot/loaders/qa.py ADDED
@@ -0,0 +1,81 @@
+ from pathlib import Path, PurePath
+ from typing import Any
+ from collections.abc import Callable
+ import pandas as pd
+ from langchain.docstore.document import Document
+ from .abstract import AbstractLoader
+
+
+ class QAFileLoader(AbstractLoader):
+     """
+     Question-and-Answer file loader based on Excel.
+     """
+     _extension = ['.xlsx']
+     chunk_size = 768
+
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'QA',
+         columns: list = None,
+         **kwargs
+     ):
+         super().__init__(tokenizer, text_splitter, source_type, **kwargs)
+         self.path = path
+         # default Q/A column headers; None default avoids a shared mutable argument
+         self._columns = columns or ['Question', 'Answer']
+         if isinstance(path, str):
+             self.path = Path(path).resolve()
+         if self.path.is_dir():
+             raise ValueError(
+                 "QAFileLoader currently only accepts single files, not directories."
+             )
+
+     def _load_document(self, path: PurePath, **kwargs) -> list:
+         if path.exists():
+             self.logger.info(f"Loading QA Excel file: {path}")
+             df = pd.read_excel(path)
+             q = self._columns[0]
+             a = self._columns[1]
+             docs = []
+             for idx, row in df.iterrows():
+                 # One Document per Question/Answer row
+                 doc = Document(
+                     page_content=f"**Question:** {row[q]}: **Answer:** {row[a]}",
+                     metadata={
+                         "url": '',
+                         "index": f"{path.name} #{idx}",
+                         "source": f"{path.name} Row.#{idx}",
+                         "filename": f"{path.name}",
+                         "question": row[q],
+                         "answer": row[a],
+                         "page_number": idx,
+                         "source_type": self._source_type,
+                         "type": "QA",
+                         "summary": f"Question: {row[q]}?: **{row[a]}**",
+                         "document_meta": {
+                             "question": row[q],
+                             "answer": row[a],
+                         }
+                     }
+                 )
+                 docs.append(doc)
+             return docs
+         return []
+
+     def load(self, **kwargs) -> list:
+         """
+         Load question/answer pairs from the Excel file.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if self.path.is_file():
+             documents = self._load_document(path=self.path, **kwargs)
+             # after all documents are retrieved, processed and stored
+             return self.split_documents(documents)
+         return []
+
+     def parse(self, source):
+         pass
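Each spreadsheet row becomes one Document whose page_content interleaves the question and answer while the metadata keeps them separate, so a retriever can match on either field. A sketch with a hypothetical workbook:

    from parrot.loaders.qa import QAFileLoader

    loader = QAFileLoader(
        path="faq.xlsx",                 # hypothetical file; a str is resolved to a Path
        columns=["Question", "Answer"],  # default column names
    )
    qa_docs = loader.load()
    print(qa_docs[0].metadata["question"], "->", qa_docs[0].metadata["answer"])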
parrot/loaders/repo.py ADDED
@@ -0,0 +1,103 @@
+ from pathlib import Path, PurePath
+ from langchain_core.document_loaders.blob_loaders import Blob
+ from langchain_community.document_loaders.generic import GenericLoader
+ from langchain_community.document_loaders.parsers import LanguageParser
+ from langchain_community.document_loaders import (
+     DirectoryLoader,
+     TextLoader,
+     JSONLoader
+ )
+ from langchain_text_splitters import Language
+ from langchain.text_splitter import (
+     RecursiveCharacterTextSplitter
+ )
+ from .abstract import AbstractLoader
+
+
+ class RepositoryLoader(AbstractLoader):
+     """Repository (Code Directory) loader.
+     """
+     exclude_paths: list = [
+         ".venv/**",
+         ".venv/**/**/*",
+         ".git/**",
+         "node_modules/**",
+         "build/**",
+         "dist/**",
+         "templates/**",
+         "tmp/**"
+     ]
+
+     def load(self, path: PurePath, lang: str = 'python', excludes: list = None) -> list:
+         """
+         Load data from a repository and return it as a list of Langchain Documents.
+         """
+         if isinstance(path, str):
+             path = Path(path)  # concrete Path, so .resolve() is available below
+         if excludes:
+             self.exclude_paths += excludes
+         excludes_path = [
+             str(path.joinpath(p).resolve()) for p in self.exclude_paths
+         ]
+         if lang == 'python':
+             parser = LanguageParser(language=Language.PYTHON, parser_threshold=100)
+             splitter = RecursiveCharacterTextSplitter.from_language(
+                 language=Language.PYTHON, chunk_size=1024, chunk_overlap=200
+             )
+             suffixes = [".py", ".pyx"]
+             glob = "**/[!.]*.py*"  # "*" so bare .py files match as well as .pyx
+         elif lang == 'javascript':
+             parser = LanguageParser(language=Language.JS, parser_threshold=100)
+             splitter = RecursiveCharacterTextSplitter.from_language(
+                 language=Language.JS, chunk_size=1024, chunk_overlap=200
+             )
+             suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
+             glob = "**/[!.]*"  # non-hidden files; suffixes restrict the matches
+         elif lang == 'typescript':
+             parser = LanguageParser(language=Language.TS, parser_threshold=100)
+             splitter = RecursiveCharacterTextSplitter.from_language(
+                 language=Language.TS, chunk_size=1024, chunk_overlap=200
+             )
+             suffixes = [".js", ".jsx", ".json", ".ts", ".tsx"]
+             glob = "**/[!.]*"  # non-hidden files; suffixes restrict the matches
+         elif lang == 'json':
+             loader = DirectoryLoader(
+                 path,
+                 glob="**/*.json",
+                 show_progress=True,
+                 exclude=excludes_path,
+                 silent_errors=True,
+                 recursive=True,
+                 # loader_cls=TextLoader,
+                 loader_cls=JSONLoader,
+                 loader_kwargs={
+                     'jq_schema': '.',
+                     'text_content': False
+                 }
+             )
+             docs = loader.load()
+             for doc in docs:
+                 doc.metadata['url'] = ''
+                 doc.metadata['source_type'] = self._source_type
+                 doc.metadata['language'] = lang
+             return self.text_splitter.split_documents(docs)
+         else:
+             raise ValueError(
+                 f"Language {lang} not supported for Repository"
+             )
+         loader = GenericLoader.from_filesystem(
+             path,
+             glob=glob,
+             suffixes=suffixes,
+             exclude=self.exclude_paths,
+             parser=parser,
+             show_progress=True
+         )
+         docs = loader.load()
+         for doc in docs:
+             doc.metadata['url'] = ''
+             doc.metadata['source_type'] = self._source_type
+             doc.metadata['language'] = lang
+         documents = splitter.split_documents(docs)
+         return documents
+
+     def parse(self, source):
+         raise NotImplementedError("Parser method is not implemented for RepositoryLoader.")
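For the code branches, GenericLoader pairs a filesystem glob with a LanguageParser, and the language-aware splitter keeps chunks near function and class boundaries. A sketch indexing a hypothetical Python checkout, assuming AbstractLoader's constructor defaults:

    from parrot.loaders.repo import RepositoryLoader

    loader = RepositoryLoader()    # constructor defaults assumed
    documents = loader.load(
        "/path/to/repo",           # hypothetical checkout
        lang="python",
        excludes=["tests/**"],     # appended to the default exclude list
    )
    print(f"{len(documents)} code chunks loaded")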