ai-parrot 0.3.4__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ai-parrot might be problematic. Click here for more details.

Files changed (109):
  1. ai_parrot-0.3.4.dist-info/LICENSE +21 -0
  2. ai_parrot-0.3.4.dist-info/METADATA +319 -0
  3. ai_parrot-0.3.4.dist-info/RECORD +109 -0
  4. ai_parrot-0.3.4.dist-info/WHEEL +6 -0
  5. ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
  6. parrot/__init__.py +21 -0
  7. parrot/chatbots/__init__.py +7 -0
  8. parrot/chatbots/abstract.py +728 -0
  9. parrot/chatbots/asktroc.py +16 -0
  10. parrot/chatbots/base.py +366 -0
  11. parrot/chatbots/basic.py +9 -0
  12. parrot/chatbots/bose.py +17 -0
  13. parrot/chatbots/cody.py +17 -0
  14. parrot/chatbots/copilot.py +83 -0
  15. parrot/chatbots/dataframe.py +103 -0
  16. parrot/chatbots/hragents.py +15 -0
  17. parrot/chatbots/odoo.py +17 -0
  18. parrot/chatbots/retrievals/__init__.py +578 -0
  19. parrot/chatbots/retrievals/constitutional.py +19 -0
  20. parrot/conf.py +110 -0
  21. parrot/crew/__init__.py +3 -0
  22. parrot/crew/tools/__init__.py +22 -0
  23. parrot/crew/tools/bing.py +13 -0
  24. parrot/crew/tools/config.py +43 -0
  25. parrot/crew/tools/duckgo.py +62 -0
  26. parrot/crew/tools/file.py +24 -0
  27. parrot/crew/tools/google.py +168 -0
  28. parrot/crew/tools/gtrends.py +16 -0
  29. parrot/crew/tools/md2pdf.py +25 -0
  30. parrot/crew/tools/rag.py +42 -0
  31. parrot/crew/tools/search.py +32 -0
  32. parrot/crew/tools/url.py +21 -0
  33. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  34. parrot/handlers/__init__.py +4 -0
  35. parrot/handlers/bots.py +196 -0
  36. parrot/handlers/chat.py +162 -0
  37. parrot/interfaces/__init__.py +6 -0
  38. parrot/interfaces/database.py +29 -0
  39. parrot/llms/__init__.py +137 -0
  40. parrot/llms/abstract.py +47 -0
  41. parrot/llms/anthropic.py +42 -0
  42. parrot/llms/google.py +42 -0
  43. parrot/llms/groq.py +45 -0
  44. parrot/llms/hf.py +45 -0
  45. parrot/llms/openai.py +59 -0
  46. parrot/llms/pipes.py +114 -0
  47. parrot/llms/vertex.py +78 -0
  48. parrot/loaders/__init__.py +20 -0
  49. parrot/loaders/abstract.py +456 -0
  50. parrot/loaders/audio.py +106 -0
  51. parrot/loaders/basepdf.py +102 -0
  52. parrot/loaders/basevideo.py +280 -0
  53. parrot/loaders/csv.py +42 -0
  54. parrot/loaders/dir.py +37 -0
  55. parrot/loaders/excel.py +349 -0
  56. parrot/loaders/github.py +65 -0
  57. parrot/loaders/handlers/__init__.py +5 -0
  58. parrot/loaders/handlers/data.py +213 -0
  59. parrot/loaders/image.py +119 -0
  60. parrot/loaders/json.py +52 -0
  61. parrot/loaders/pdf.py +437 -0
  62. parrot/loaders/pdfchapters.py +142 -0
  63. parrot/loaders/pdffn.py +112 -0
  64. parrot/loaders/pdfimages.py +207 -0
  65. parrot/loaders/pdfmark.py +88 -0
  66. parrot/loaders/pdftables.py +145 -0
  67. parrot/loaders/ppt.py +30 -0
  68. parrot/loaders/qa.py +81 -0
  69. parrot/loaders/repo.py +103 -0
  70. parrot/loaders/rtd.py +65 -0
  71. parrot/loaders/txt.py +92 -0
  72. parrot/loaders/utils/__init__.py +1 -0
  73. parrot/loaders/utils/models.py +25 -0
  74. parrot/loaders/video.py +96 -0
  75. parrot/loaders/videolocal.py +120 -0
  76. parrot/loaders/vimeo.py +106 -0
  77. parrot/loaders/web.py +216 -0
  78. parrot/loaders/web_base.py +112 -0
  79. parrot/loaders/word.py +125 -0
  80. parrot/loaders/youtube.py +192 -0
  81. parrot/manager.py +166 -0
  82. parrot/models.py +372 -0
  83. parrot/py.typed +0 -0
  84. parrot/stores/__init__.py +48 -0
  85. parrot/stores/abstract.py +171 -0
  86. parrot/stores/milvus.py +632 -0
  87. parrot/stores/qdrant.py +153 -0
  88. parrot/tools/__init__.py +12 -0
  89. parrot/tools/abstract.py +53 -0
  90. parrot/tools/asknews.py +32 -0
  91. parrot/tools/bing.py +13 -0
  92. parrot/tools/duck.py +62 -0
  93. parrot/tools/google.py +170 -0
  94. parrot/tools/stack.py +26 -0
  95. parrot/tools/weather.py +70 -0
  96. parrot/tools/wikipedia.py +59 -0
  97. parrot/tools/zipcode.py +179 -0
  98. parrot/utils/__init__.py +2 -0
  99. parrot/utils/parsers/__init__.py +5 -0
  100. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  101. parrot/utils/toml.py +11 -0
  102. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  103. parrot/utils/uv.py +11 -0
  104. parrot/version.py +10 -0
  105. resources/users/__init__.py +5 -0
  106. resources/users/handlers.py +13 -0
  107. resources/users/models.py +205 -0
  108. settings/__init__.py +0 -0
  109. settings/settings.py +51 -0
@@ -0,0 +1,142 @@
1
+ from collections.abc import Callable
2
+ from typing import Any, Optional, List, Union
3
+ from pathlib import PurePath, Path
4
+ from io import StringIO
5
+ import fitz # PyMuPDF
6
+ from langchain.docstore.document import Document
7
+ from langchain.text_splitter import (
8
+ RecursiveCharacterTextSplitter
9
+ )
10
+ from .basepdf import BasePDF
11
+
12
+
13
class PDFChapterLoader(BasePDF):
    """
    Preserving Chapter Structure from PDF files.

    Text blocks are read page by page with PyMuPDF. A block that contains at
    least one span rendered in a title font (see ``eval_title``) and whose text
    has not been seen as a title before starts a new chapter; every chapter is
    returned as one Langchain ``Document``.
    """
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        # Which Font is used for titles (Chapter separation)
        self.title_font: str = kwargs.get('title_font', 'Calibri-Bold')
        if not text_splitter:
            # No splitter supplied by the caller: build one from the tokenizer.
            self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                self.tokenizer,
                chunk_size=2000,
                chunk_overlap=100,
                add_start_index=True,  # If `True`, includes chunk's start index in metadata
                strip_whitespace=True,  # strips whitespace from the start and end
                separators=["\n\n", "\n", "\r\n", "\r", "\f", "\v", "\x0b", "\x0c"],
            )

    def eval_title(self, title_font: str) -> bool:
        """
        Check if the font is a title font.

        Args:
            title_font (str): The font to check.

        Returns:
            bool: True if the font name contains 'Bold' or equals the
                configured ``title_font``.
        """
        return 'Bold' in title_font or title_font == self.title_font

    def _page_metadata(self, path: PurePath, page_num: int) -> dict:
        """Build a fresh metadata dict for the (0-based) page ``page_num``."""
        page_number = page_num + 1
        return {
            "url": '',
            "index": f"{path.name} #{page_number}",
            "source": f"{path.name} #{page_number}",
            "filename": path.name,
            "source_type": self._source_type,
            "type": "pdf",
            "question": "",
            "answer": "",
            "summary": '',
            "document_meta": {
                # NOTE(review): 0-based here while index/source are 1-based;
                # kept as-is because consumers may rely on it.
                "page_number": page_num,
            }
        }

    def _load_pdf(self, path: PurePath, **kwargs):
        """
        Open a PDF file and split its text into chapters.

        Args:
            path (PurePath): The path to the PDF file.

        Returns:
            list: One Langchain Document per detected chapter. Each document
                carries the metadata of the page on which its chapter starts.
        """
        pdf = fitz.open(path)
        self.logger.info(f"Loading PDF file: {path}")
        chapters = []
        current_chapter_text = ''
        current_chapter_title = ''
        current_chapter_page = None
        current_chapter_meta = None
        chapter_titles = set()  # Keep track of unique chapter titles

        def _flush():
            # Store the running chapter unless it is empty or only a title.
            content = current_chapter_text.strip()
            if content and content != current_chapter_title.strip():
                chapters.append({
                    'chapter': current_chapter_title,
                    'content': content,
                    'page': current_chapter_page,
                    'meta': current_chapter_meta
                })

        try:
            for page_num in range(len(pdf)):
                page = pdf.load_page(page_num)
                for block in page.get_text("dict")["blocks"]:
                    if block['type'] != 0:  # only text blocks are processed
                        continue
                    spans = [
                        span for line in block["lines"] for span in line["spans"]
                    ]
                    block_text = ''.join(span['text'] for span in spans)
                    title = block_text.strip()
                    # A block is a title when any of its spans uses a title font.
                    starts_chapter = (
                        any(self.eval_title(span['font']) for span in spans)
                        and title not in chapter_titles
                    )
                    if starts_chapter:
                        # Save the current chapter and start a new one.
                        _flush()
                        current_chapter_title = f"**{title}**: "
                        current_chapter_page = page_num + 1
                        current_chapter_meta = self._page_metadata(path, page_num)
                        current_chapter_text = current_chapter_title
                        chapter_titles.add(title)
                    else:
                        # Text before the first title (or a repeated title)
                        # continues the running chapter.
                        if current_chapter_meta is None:
                            current_chapter_meta = self._page_metadata(path, page_num)
                        current_chapter_text += block_text

                    # While no title has been seen the title is '' and
                    # startswith('') is True, so no newline is added; once a
                    # title is set a newline follows each block.
                    if not block_text.strip().startswith(current_chapter_title):
                        current_chapter_text += "\n"
        finally:
            # Release the file handle held by PyMuPDF.
            pdf.close()

        # Save the last chapter if it exists and it's not just the title
        _flush()
        return [
            Document(page_content=chapter['content'], metadata=chapter['meta'])
            for chapter in chapters
        ]
@@ -0,0 +1,112 @@
1
+ from collections.abc import Callable
2
+ from typing import Any, Optional, List, Union
3
+ from pathlib import PurePath, Path
4
+ from io import StringIO
5
+ import fitz # PyMuPDF
6
+ from langchain.docstore.document import Document
7
+ from langchain.text_splitter import (
8
+ RecursiveCharacterTextSplitter
9
+ )
10
+ from .basepdf import BasePDF
11
+
12
+
13
class PDFFnLoader(BasePDF):
    """
    Loading a PDF with including function processing.

    Tables are located on every page with PyMuPDF's table finder using
    ``table_settings``. Turning the extracted tables into documents is not
    implemented yet, so ``_load_pdf`` currently returns an empty list.
    """
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Union[None, Callable[..., Any]] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        # Removed before calling the base class so it is not forwarded.
        table_settings = kwargs.pop('table_settings', {})
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        if not text_splitter:
            self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                self.tokenizer,
                chunk_size=2000,
                chunk_overlap=100,
                add_start_index=True,  # If `True`, includes chunk's start index in metadata
                strip_whitespace=True,  # strips whitespace from the start and end
                separators=["\n\n", "\n", "\r\n", "\r", "\f", "\v", "\x0b", "\x0c"],
            )
        # Define settings for Fitz Table Processing; caller values win.
        self.table_settings = {
            "vertical_strategy": "lines",
            "horizontal_strategy": "text",
            "intersection_x_tolerance": 5,
            "intersection_y_tolerance": 5,
            "edge_min_length": 10,
            **table_settings,
        }

    def set_metadata(self, path, page, page_number, **kwargs) -> dict:
        """
        Build the metadata dict for one page.

        Args:
            path: Path of the PDF file (only ``path.name`` is used).
            page: The page object (currently unused).
            page_number (int): 0-based page index.
            **kwargs: Extra entries merged into ``document_meta``.

        Returns:
            dict: Metadata whose index/source and ``page_number`` are 1-based.
        """
        n = page_number + 1
        return {
            "url": '',
            "index": f"{path.name} #{n}",
            "source": f"{path.name} #{n}",
            "filename": path.name,
            "source_type": self._source_type,
            "type": "pdf",
            "question": "",
            "answer": "",
            "summary": '',
            "document_meta": {
                "page_number": n,
                **kwargs
            }
        }

    def processing_table(self, table, table_idx, page, **kwargs) -> list:
        """
        Extract the cell texts of one table.

        Args:
            table: A PyMuPDF table as yielded by ``page.find_tables()``.
            table_idx (int): Position of the table on the page (for logging).
            page: The page the table belongs to (currently unused).

        Returns:
            list: Rows as lists of cell values; rows without any text are
                dropped.
        """
        table_data = [
            row for row in table.extract()
            if any(cell and str(cell).strip() for cell in row)
        ]
        self.logger.debug(
            f"Table #{table_idx}: extracted {len(table_data)} non-empty rows"
        )
        return table_data

    def _load_pdf(self, path: PurePath, **kwargs):
        """
        Open a PDF file and process the tables found on each page.

        Args:
            path (PurePath): The path to the PDF file.

        Returns:
            list: Always an empty list; the extracted tables are not yet
                converted into documents.
        """
        pdf = fitz.open(path)
        self.logger.info(f"Loading PDF file: {path}")
        try:
            for page_num in range(len(pdf)):
                page = pdf.load_page(page_num)
                tables = page.find_tables(**self.table_settings)
                for tab_idx, table in enumerate(tables):
                    self.processing_table(table, tab_idx, page)
        finally:
            # Release the file handle held by PyMuPDF.
            pdf.close()
        return []
@@ -0,0 +1,207 @@
1
+ from collections.abc import Callable
2
+ from pathlib import Path, PurePath
3
+ from typing import Any
4
+ import fitz
5
+ from pdf4llm import to_markdown
6
+ from PIL import Image
7
+ from langchain.docstore.document import Document
8
+ from langchain.text_splitter import MarkdownTextSplitter
9
+ from transformers import (
10
+ AutoTokenizer,
11
+ AutoProcessor,
12
+ LlavaForConditionalGeneration,
13
+ pipeline,
14
+ BitsAndBytesConfig
15
+ )
16
+ import torch
17
+ from .basepdf import BasePDF
18
+
19
+
20
# Module-level quantization settings (4-bit loading, float16 compute) that
# PDFImageLoader passes to its pipeline through ``model_kwargs``.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)
24
+
25
+
26
class PDFImageLoader(BasePDF):
    """
    Loader for PDF files that also describes every page as an image.

    The document text is converted to Markdown and chunked; afterwards each
    page is rendered to an image and passed, with a prompt, to an
    image-to-text pipeline whose answers become additional documents.
    """
    default_prompt: str = "<|user|>\n<image>\nExplain this schematic diagram or technical installation instructions and wire diagrams, please be detailed about descriptions of steps:<|end|>\n<|assistant|>\n"

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        self._image_model = kwargs.get('image_model', 'llava-hf/llava-v1.6-vicuna-7b-hf')
        self._task = kwargs.get('task', 'image-to-text')
        self._max_tokens = kwargs.get('max_tokens', 600)
        # The pipeline is created eagerly and released again in load().
        self._pipeline = pipeline(
            self._task,
            model=self._image_model,
            model_kwargs={"quantization_config": quantization_config},
            max_new_tokens=self._max_tokens,
            use_fast=True
        )
        # default prompt
        self._prompt = kwargs.get('prompt', self.default_prompt)
        # Markdown Splitter
        self._splitter = MarkdownTextSplitter(
            chunk_size=self._chunk_size,
            chunk_overlap=10
        )

    def pixmap_to_pil_image(self, pix):
        """Converts a PyMuPDF Pixmap object to a PIL Image"""
        return Image.frombytes(
            "RGB",
            [pix.width, pix.height],
            pix.samples
        )

    def _markdown_documents(self, pdf, path: Path) -> list:
        """Return the Markdown text of ``pdf`` as chunked Documents."""
        docs = []
        try:
            # get markdown for all pages and saved separately
            md_text = to_markdown(pdf)
            try:
                summary = self.get_summary_from_text(md_text)
            except Exception:
                # Summaries are best-effort; continue without one.
                summary = ''
            metadata = {
                "url": '',
                "idx": str(path.name),
                "filename": str(path.name),
                "source": str(path.name),
                "type": 'pdf',
                "question": '',
                "answer": '',
                "data": {},
                "summary": summary,
                "source_type": self._source_type,
                "document_meta": {
                    "title": pdf.metadata.get("title", ""),
                    "author": pdf.metadata.get("author", ""),
                }
            }
            for idx, chunk in enumerate(self._splitter.split_text(md_text)):
                docs.append(
                    Document(
                        page_content=chunk,
                        metadata={"index": f"{idx}", **metadata}
                    )
                )
        except (IndexError, ValueError) as exc:
            self.logger.warning(
                f"There is no text data to load on {path.name}: {exc}"
            )
        return docs

    def _page_documents(self, page, page_num: int, file_name: str, path: Path) -> list:
        """Render one page as an image and return the model's descriptions."""
        pix = page.get_pixmap(colorspace=fitz.csRGB, alpha=False)
        img_stream = self.pixmap_to_pil_image(pix)
        url = ''
        img_name = f'image_{file_name}_{page_num}.png'
        if self.save_images is True:
            self.save_image(img_stream, img_name, self._imgdir)
            url = f'/static/images/{img_name}'
        # extracting features and explanations:
        outputs = self._pipeline(
            img_stream,
            prompt=self._prompt,
            generate_kwargs={"max_new_tokens": self._max_tokens}
        )
        documents = []
        for idx, output in enumerate(outputs):
            generated_text = output['generated_text']
            # Split using the special tokens, if available
            split_text = generated_text.split("<|assistant|>")
            prompt_text = ""
            if "<|prompt|>" in generated_text:
                prompt_text = split_text[0].replace("<|prompt|>", "").strip()
            response_text = split_text[1].strip() if len(split_text) > 1 else ""
            # Attach the image using Markdown syntax
            response_text += f"\n\n![Image]({url})\n"
            _meta = {
                "url": f"{url}",
                "filename": str(path.name),
                "index": f"Page {page_num}, part: {idx}",
                "source": str(path.name),
                "type": 'pdf',
                "question": prompt_text,
                "answer": '',
                "data": {},
                "summary": '',
                "source_type": self._source_type,
                "document_meta": {
                    # Page number, not the repr of the page object.
                    "page": f"Page {page_num}",
                    "image": f"{img_name}",
                    "url": f"{url}"
                }
            }
            documents.append(
                Document(page_content=response_text, metadata=_meta)
            )
        return documents

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file as text chunks plus per-page image descriptions.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents: the Markdown chunks followed
                by the descriptions of every page. Empty when the path check
                fails.
        """
        if not self._check_path(path):
            return []
        self.logger.info(f"Loading PDF file: {path}")
        pdf = fitz.open(str(path))  # Open the PDF file
        try:
            docs = self._markdown_documents(pdf, path)
            # Then, processing the pages one by one as Images; results are
            # accumulated across all pages.
            documents = []
            file_name = path.stem.replace(" ", "_").replace(".", "_")
            for page_number in range(pdf.page_count):
                page_num = page_number + 1
                self.logger.notice(
                    f"Processing PDF {path} on Page {page_num}"
                )
                documents.extend(
                    self._page_documents(pdf[page_number], page_num, file_name, path)
                )
        finally:
            # Release the file handle held by PyMuPDF.
            pdf.close()
        return docs + documents

    def load(self) -> list:
        """Run the base loader, then drop the reference to the pipeline."""
        try:
            return super().load()
        finally:
            self._pipeline = None
@@ -0,0 +1,88 @@
1
+ from typing import Any
2
+ from collections.abc import Callable
3
+ from pathlib import Path, PurePath
4
+ import fitz
5
+ from pdf4llm import to_markdown
6
+ from langchain.docstore.document import Document
7
+ from langchain.text_splitter import MarkdownTextSplitter
8
+ from .basepdf import BasePDF
9
+
10
class PDFMarkdownLoader(BasePDF):
    """
    Loader for PDF files converted content to markdown.
    """

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        # Splitter used to chunk the Markdown produced for the document.
        self._splitter = MarkdownTextSplitter(chunk_size=1024, chunk_overlap=10)

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file as Markdown using PyMuPDF and pdf4llm.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents, one per Markdown chunk.
                Empty when the path check fails.
        """
        if not self._check_path(path):
            return []
        self.logger.info(f"Loading PDF file: {path}")
        pdf = fitz.open(str(path))
        try:
            md_text = to_markdown(pdf)  # get markdown for all pages
            document_meta = {
                "title": pdf.metadata.get("title", ""),
                "creationDate": pdf.metadata.get("creationDate", ""),
                "author": pdf.metadata.get("author", ""),
            }
        finally:
            # Everything needed has been read; release the file handle.
            pdf.close()
        try:
            summary = self.get_summary_from_text(md_text)
        except Exception:
            # Summaries are best-effort; continue without one.
            summary = ''
        metadata = {
            "url": '',
            "filename": path.name,
            "source": str(path.name),
            "type": 'pdf',
            "question": '',
            "answer": '',
            "data": {},
            "summary": summary,
            "source_type": self._source_type,
            "document_meta": document_meta
        }
        return [
            Document(page_content=chunk, metadata={"index": f"{idx}", **metadata})
            for idx, chunk in enumerate(self._splitter.split_text(md_text))
        ]
@@ -0,0 +1,145 @@
1
+ from collections.abc import Callable
2
+ from typing import Any, Optional, List
3
+ from pathlib import Path, PurePath
4
+ from io import StringIO
5
+ import fitz
6
+ import pandas as pd
7
+ from langchain.docstore.document import Document
8
+ from .basepdf import BasePDF
9
+
10
+
11
class PDFTablesLoader(BasePDF):
    """
    Loader for Tables in PDF Files.

    Every table found by PyMuPDF's table finder is converted to a pandas
    DataFrame and emitted as one Markdown-formatted Langchain Document.
    """
    _extension = ['.pdf']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        table_settings: Optional[dict] = None,
        **kwargs
    ):
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        # Table Settings:
        self.table_settings = {
            "intersection_x_tolerance": 5,
            "intersection_y_tolerance": 5
        }
        if table_settings:
            self.table_settings.update(table_settings)
        self._skiprows = kwargs.pop('skiprows', None)

    def unique_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rename duplicate columns in the DataFrame to ensure they are unique.

        Duplicates receive a numeric suffix (``name_1``, ``name_2``, ...). The
        columns of ``df`` are replaced in place.

        Args:
            df (pd.DataFrame): The DataFrame with potential duplicate column names.

        Returns:
            pd.DataFrame: The same DataFrame with unique column names.
        """
        seen = {}
        taken = set()
        new_columns = []
        for col in df.columns:
            new_col = col
            count = seen.get(col, 0)
            while new_col in taken:
                count += 1
                new_col = f"{col}_{count}"
            taken.add(new_col)
            new_columns.append(new_col)
            seen[col] = count
        df.columns = new_columns
        return df

    def get_markdown(self, df: pd.DataFrame) -> str:
        """
        Convert a DataFrame to a Markdown string.

        Args:
            df (pd.DataFrame): The DataFrame to convert; its column names are
                made unique first.

        Returns:
            str: The Markdown table.
        """
        buffer = StringIO()
        df = self.unique_columns(df)
        df.to_markdown(buffer)
        return buffer.getvalue()

    def parse_table(self, table_idx, table, page_number, path) -> tuple:
        """
        Convert one PyMuPDF table into a DataFrame plus its metadata.

        Args:
            table_idx (int): Position of the table on its page.
            table: A PyMuPDF table as yielded by ``page.find_tables()``.
            page_number (int): 0-based page index.
            path: Path of the PDF file (only ``path.name`` is used).

        Returns:
            tuple: ``(df, table_meta)`` where ``df`` has its all-empty rows and
                columns removed and ``table_meta`` is the metadata dict.
        """
        df = table.to_pandas()  # convert to pandas DataFrame
        df = df.dropna(axis=1, how='all')
        df = df.dropna(how='all', axis=0)  # Drop empty rows
        page = page_number + 1
        table_meta = {
            "url": '',
            "source": f"{path.name} Page.#{page} Table.#{table_idx}",
            "filename": path.name,
            "index": f"{path.name}:Table:{table_idx}",
            "question": '',
            "answer": '',
            "type": 'table',
            "data": {},
            "summary": '',
            "document_meta": {
                "table_index": table_idx,
                "table_shape": df.shape,
                "table_columns": df.columns.tolist(),
                "description": f"Extracted from Page.#{page}."
            },
            "source_type": self._source_type
        }
        return df, table_meta

    def _load_pdf(self, path: Path) -> list:
        """
        Load the tables of a PDF file using the Fitz library.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents, one per non-empty table.
                Empty when the path check fails.
        """
        if not self._check_path(path):
            return []
        self.logger.info(f"Loading PDF file: {path}")
        docs = []
        pdf = fitz.open(str(path))  # Open the PDF file
        try:
            for page_number in range(pdf.page_count):
                page = pdf[page_number]
                try:
                    tabs = page.find_tables(**self.table_settings)
                except Exception as exc:
                    # Best effort: a failing page must not abort the file.
                    self.logger.warning(
                        f"Unable to find tables on page {page_number + 1} of {path.name}: {exc}"
                    )
                    continue
                for tab_idx, tab in enumerate(tabs):
                    # Errors are handled per table so that one bad table does
                    # not discard the remaining tables of the page.
                    try:
                        df, _meta = self.parse_table(tab_idx, tab, page_number, path)
                        if df.empty:
                            continue
                        self.logger.debug(
                            f"Table #{tab_idx} on page {page_number + 1}:\n{df}"
                        )
                        # convert into markdown:
                        txt = df.to_markdown()
                    except Exception as exc:
                        self.logger.warning(
                            f"Unable to process table #{tab_idx} on page {page_number + 1} of {path.name}: {exc}"
                        )
                        continue
                    if txt:
                        docs.append(
                            Document(page_content=txt, metadata=_meta)
                        )
        finally:
            # Release the file handle held by PyMuPDF.
            pdf.close()
        return docs