ai-parrot 0.1.0__cp311-cp311-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ai-parrot might be problematic. Click here for more details.

Files changed (108) hide show
  1. ai_parrot-0.1.0.dist-info/LICENSE +21 -0
  2. ai_parrot-0.1.0.dist-info/METADATA +299 -0
  3. ai_parrot-0.1.0.dist-info/RECORD +108 -0
  4. ai_parrot-0.1.0.dist-info/WHEEL +5 -0
  5. ai_parrot-0.1.0.dist-info/top_level.txt +3 -0
  6. parrot/__init__.py +18 -0
  7. parrot/chatbots/__init__.py +7 -0
  8. parrot/chatbots/abstract.py +965 -0
  9. parrot/chatbots/asktroc.py +16 -0
  10. parrot/chatbots/base.py +257 -0
  11. parrot/chatbots/basic.py +9 -0
  12. parrot/chatbots/bose.py +17 -0
  13. parrot/chatbots/cody.py +17 -0
  14. parrot/chatbots/copilot.py +100 -0
  15. parrot/chatbots/dataframe.py +103 -0
  16. parrot/chatbots/hragents.py +15 -0
  17. parrot/chatbots/oddie.py +17 -0
  18. parrot/chatbots/retrievals/__init__.py +515 -0
  19. parrot/chatbots/retrievals/constitutional.py +19 -0
  20. parrot/conf.py +108 -0
  21. parrot/crew/__init__.py +3 -0
  22. parrot/crew/tools/__init__.py +22 -0
  23. parrot/crew/tools/bing.py +13 -0
  24. parrot/crew/tools/config.py +43 -0
  25. parrot/crew/tools/duckgo.py +62 -0
  26. parrot/crew/tools/file.py +24 -0
  27. parrot/crew/tools/google.py +168 -0
  28. parrot/crew/tools/gtrends.py +16 -0
  29. parrot/crew/tools/md2pdf.py +25 -0
  30. parrot/crew/tools/rag.py +42 -0
  31. parrot/crew/tools/search.py +32 -0
  32. parrot/crew/tools/url.py +21 -0
  33. parrot/exceptions.cpython-311-x86_64-linux-gnu.so +0 -0
  34. parrot/handlers/__init__.py +4 -0
  35. parrot/handlers/bots.py +196 -0
  36. parrot/handlers/chat.py +169 -0
  37. parrot/interfaces/__init__.py +6 -0
  38. parrot/interfaces/database.py +29 -0
  39. parrot/llms/__init__.py +0 -0
  40. parrot/llms/abstract.py +41 -0
  41. parrot/llms/anthropic.py +36 -0
  42. parrot/llms/google.py +37 -0
  43. parrot/llms/groq.py +33 -0
  44. parrot/llms/hf.py +39 -0
  45. parrot/llms/openai.py +49 -0
  46. parrot/llms/pipes.py +103 -0
  47. parrot/llms/vertex.py +68 -0
  48. parrot/loaders/__init__.py +20 -0
  49. parrot/loaders/abstract.py +456 -0
  50. parrot/loaders/basepdf.py +102 -0
  51. parrot/loaders/basevideo.py +280 -0
  52. parrot/loaders/csv.py +42 -0
  53. parrot/loaders/dir.py +37 -0
  54. parrot/loaders/excel.py +349 -0
  55. parrot/loaders/github.py +65 -0
  56. parrot/loaders/handlers/__init__.py +5 -0
  57. parrot/loaders/handlers/data.py +213 -0
  58. parrot/loaders/image.py +119 -0
  59. parrot/loaders/json.py +52 -0
  60. parrot/loaders/pdf.py +187 -0
  61. parrot/loaders/pdfchapters.py +142 -0
  62. parrot/loaders/pdffn.py +112 -0
  63. parrot/loaders/pdfimages.py +207 -0
  64. parrot/loaders/pdfmark.py +88 -0
  65. parrot/loaders/pdftables.py +145 -0
  66. parrot/loaders/ppt.py +30 -0
  67. parrot/loaders/qa.py +81 -0
  68. parrot/loaders/repo.py +103 -0
  69. parrot/loaders/rtd.py +65 -0
  70. parrot/loaders/txt.py +92 -0
  71. parrot/loaders/utils/__init__.py +1 -0
  72. parrot/loaders/utils/models.py +25 -0
  73. parrot/loaders/video.py +96 -0
  74. parrot/loaders/videolocal.py +107 -0
  75. parrot/loaders/vimeo.py +106 -0
  76. parrot/loaders/web.py +216 -0
  77. parrot/loaders/web_base.py +112 -0
  78. parrot/loaders/word.py +125 -0
  79. parrot/loaders/youtube.py +192 -0
  80. parrot/manager.py +152 -0
  81. parrot/models.py +347 -0
  82. parrot/py.typed +0 -0
  83. parrot/stores/__init__.py +0 -0
  84. parrot/stores/abstract.py +170 -0
  85. parrot/stores/milvus.py +540 -0
  86. parrot/stores/qdrant.py +153 -0
  87. parrot/tools/__init__.py +16 -0
  88. parrot/tools/abstract.py +53 -0
  89. parrot/tools/asknews.py +32 -0
  90. parrot/tools/bing.py +13 -0
  91. parrot/tools/duck.py +62 -0
  92. parrot/tools/google.py +170 -0
  93. parrot/tools/stack.py +26 -0
  94. parrot/tools/weather.py +70 -0
  95. parrot/tools/wikipedia.py +59 -0
  96. parrot/tools/zipcode.py +179 -0
  97. parrot/utils/__init__.py +2 -0
  98. parrot/utils/parsers/__init__.py +5 -0
  99. parrot/utils/parsers/toml.cpython-311-x86_64-linux-gnu.so +0 -0
  100. parrot/utils/toml.py +11 -0
  101. parrot/utils/types.cpython-311-x86_64-linux-gnu.so +0 -0
  102. parrot/utils/uv.py +11 -0
  103. parrot/version.py +10 -0
  104. resources/users/__init__.py +5 -0
  105. resources/users/handlers.py +13 -0
  106. resources/users/models.py +205 -0
  107. settings/__init__.py +0 -0
  108. settings/settings.py +51 -0
@@ -0,0 +1,119 @@
1
+ from typing import Any
2
+ from collections.abc import Callable
3
+ from pathlib import Path, PurePath
4
+ import numpy as np
5
+ from PIL import Image
6
+ from langchain.docstore.document import Document
7
+ from transformers import CLIPModel
8
+ import torch
9
+ from torchvision import transforms
10
+ from .abstract import AbstractLoader
11
+ from ..stores.abstract import AbstractStore
12
+
13
+
14
class ImageLoader(AbstractLoader):
    """
    Image Loader.

    Computes a CLIP image embedding for every supported image found at
    ``path`` and pushes it to ``store``. Nothing is returned as Langchain
    Documents; the store is written to directly.
    """
    _extension = ['.jpg', '.jpeg', '.png']
    chunk_size = 768

    def __init__(
        self,
        path: PurePath,
        store: AbstractStore,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'image',
        **kwargs
    ):
        """
        Args:
            path: Image file or directory of images. A ``str`` is resolved
                to an absolute ``Path``.
            store: Store object whose ``insert`` method receives the data.
            tokenizer: Forwarded to the parent loader.
            text_splitter: Forwarded to the parent loader.
            source_type: Value written to the ``source_type`` field.
            **kwargs: Forwarded to the parent loader.
        """
        super().__init__(tokenizer, text_splitter, source_type, **kwargs)
        self.path = path
        if isinstance(path, str):
            self.path = Path(path).resolve()
        # Model:
        self._model = CLIPModel.from_pretrained(
            # "openai/clip-vit-base-patch32"
            "openai/clip-vit-large-patch14-336"
        )
        # Define image preprocessing
        self._preprocess = transforms.Compose(
            [
                # Adjust the size to match the model's expected input
                transforms.Resize((336, 336)),
                # Optionally add a center crop if needed
                transforms.CenterCrop(336),
                transforms.ToTensor(),
                # CLIP's original normalization
                transforms.Normalize(
                    (0.48145466, 0.4578275, 0.40821073),
                    (0.26862954, 0.26130258, 0.27577711)
                ),
            ]
        )
        # required Milvus Store:
        self.store = store

    def transform_image(self, img_data):
        """
        Turn a PIL image into a float32 embedding.

        Args:
            img_data: Image accepted by the preprocessing pipeline.

        Returns:
            numpy.ndarray: The squeezed image features as ``float32``.
        """
        image = self._preprocess(img_data)
        # The model expects a batch dimension.
        image = image.unsqueeze(0)
        with torch.no_grad():
            features = self._model.get_image_features(pixel_values=image)
        embedding = features.squeeze().cpu().numpy()
        return embedding.astype(np.float32)

    def _insert_image(self, data):
        """Send ``data`` to the store and return whatever the store returns."""
        return self.store.insert(data)

    def _load_image(self, path) -> list:
        """
        Embed one image file and insert the embedding into the store.

        Args:
            path (Path): The path to the Image file.

        Returns:
            list: Always an empty list; the image goes straight to the store.
        """
        if not self._check_path(path):
            return []
        self.logger.info(f"Loading Image file: {path}")
        # Use a context manager so the underlying file handle is released
        # as soon as the pixel data has been converted.
        with Image.open(path) as raw:
            img = raw.convert('RGB')
        embedding = self.transform_image(img).tolist()
        # NOTE(review): this record is assembled but only the bare
        # embedding is inserted below. Confirm whether the store expects
        # `[data]` instead of `[embedding]`.
        data = {
            "url": '',
            "source": f"{path.name}",
            "filename": path,
            "question": '',
            "answer": '',
            "source_type": self._source_type,
            "type": "image",
            "text": '',
            "vector": embedding,
            "document_meta": {
                "image": path.name,
                "extension": path.suffix
            }
        }
        self._insert_image([embedding])
        return []

    def load(self) -> bool:
        """
        Load one image, or every supported image in a directory.

        Returns:
            bool: ``True`` once all images were processed. Images are
            written directly to the store, so no Documents are returned.

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
            ValueError: If ``self.path`` is neither a file nor a directory.
        """
        if not self.path.exists():
            raise FileNotFoundError(
                f"Image file/directory not found: {self.path}"
            )
        if self.path.is_dir():
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    self._load_image(item)
        elif self.path.is_file():
            self._load_image(self.path)
        else:
            raise ValueError(
                f"Image Loader: Invalid path: {self.path}"
            )
        # Load Image loads the image directly to database.
        return True

    def parse(self, source):
        """Not supported by this loader; always raises NotImplementedError."""
        raise NotImplementedError(
            "Parser method is not implemented for ImageLoader."
        )
parrot/loaders/json.py ADDED
@@ -0,0 +1,52 @@
1
+ from collections.abc import Callable
2
+ from pathlib import PurePath
3
+ from langchain_community.document_loaders import JSONLoader as JSLoader
4
+ from .abstract import AbstractLoader
5
+
6
+
7
class JSONLoader(AbstractLoader):
    """
    Loader for JSON files.
    """
    _extension = ['.json']

    def extract_metadata(self, record: dict, metadata: dict) -> dict:
        """
        Build the metadata passed to the JSON loader as ``metadata_func``.

        Args:
            record (dict): The JSON record being loaded (not consulted).
            metadata (dict): Metadata supplied by the caller (not consulted).

        Returns:
            dict: A new dict holding this loader's source type and priority.
        """
        return {
            "source_type": self._source_type,
            "priority": self._priority,
        }

    def load(self, path: PurePath) -> list:
        """
        Load data from a JSON file.

        Args:
            path (PurePath): The path to the JSON file.

        Returns:
            list: The result of ``split_documents`` on the loaded documents,
            or an empty list when the path check fails.
        """
        if not self._check_path(path):
            return []
        self.logger.info(f"Loading JSON file: {path}")
        loader = JSLoader(
            file_path=path,
            jq_schema=".",
            text_content=False,
            metadata_func=self.extract_metadata,
        )
        documents = loader.load()
        # Tag every document with the file it came from.
        file_meta = {
            "filename": str(path),
        }
        for doc in documents:
            doc.metadata.update(file_meta)
        # Split the documents into chunks
        return self.split_documents(documents)
parrot/loaders/pdf.py ADDED
@@ -0,0 +1,187 @@
1
+ from collections.abc import Callable
2
+ from pathlib import Path, PurePath
3
+ from typing import Any
4
+ from io import BytesIO
5
+ import fitz
6
+ import pytesseract
7
+ from PIL import Image
8
+ from langchain.docstore.document import Document
9
+ from .basepdf import BasePDF
10
+
11
+
12
class PDFLoader(BasePDF):
    """
    Loader for PDF files.

    For every page it extracts the text, optionally the OCR text of the
    embedded images, and any tables found on the page.
    """
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        """
        Args:
            path: PDF file (or location) handled by the base loader.
            tokenizer: Forwarded to the base loader.
            text_splitter: Forwarded to the base loader.
            source_type: Value written to the ``source_type`` metadata field.
            language: Forwarded to the base loader.
            **kwargs: Forwarded to the base loader. ``parse_images`` (bool)
                enables image OCR; ``table_settings`` (dict) overrides the
                default table settings.
        """
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        self.parse_images = kwargs.get('parse_images', False)
        # Table Settings:
        self.table_settings = {
            # "vertical_strategy": "text",
            # "horizontal_strategy": "text",
            "intersection_x_tolerance": 3,
            "intersection_y_tolerance": 3
        }
        # The key was historically misspelled as 'table_setttings', so a
        # correctly spelled 'table_settings' was silently ignored. Accept
        # both so existing callers keep working.
        table_settings = (
            kwargs.get('table_settings')
            or kwargs.get('table_setttings')
            or {}
        )
        if table_settings:
            self.table_settings.update(table_settings)

    def _page_text_document(self, pdf, path: Path, page_num: int, text: str):
        """Build the Document holding the plain text of one page."""
        try:
            summary = self.get_summary_from_text(text)
        except Exception:
            # Summaries are best-effort; fall back to an empty summary.
            summary = ''
        metadata = {
            "url": '',
            "source": f"{path.name} Page.#{page_num}",
            "filename": path.name,
            "index": f"{page_num}",
            "type": 'pdf',
            "question": '',
            "answer": '',
            "source_type": self._source_type,
            "data": {},
            "summary": summary,
            "document_meta": {
                "title": pdf.metadata.get("title", ""),
                "creationDate": pdf.metadata.get("creationDate", ""),
                "author": pdf.metadata.get("author", ""),
            }
        }
        return Document(page_content=text, metadata=metadata)

    def _page_image_documents(self, pdf, page, path: Path, page_number: int) -> list:
        """OCR every image embedded in one page and return one Document each."""
        page_num = page_number + 1
        documents = []
        file_name = path.stem.replace(' ', '_').replace('.', '').lower()
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = pdf.extract_image(xref)
            image = Image.open(BytesIO(base_image["image"]))
            # Always computed so the metadata below never refers to an
            # undefined (or stale) name when images are not being saved.
            img_name = f'image_{file_name}_{page_num}_{img_index}.png'
            url = ''
            if self.save_images is True:
                img_path = self._imgdir.joinpath(img_name)
                self.logger.notice(
                    f"Saving Image Page on {img_path}"
                )
                try:
                    image.save(
                        img_path,
                        format="png",
                        optimize=True
                    )
                    url = f'/static/images/{img_name}'
                except OSError as exc:
                    # Saving is best-effort: keep going without a URL.
                    self.logger.warning(
                        f"Unable to save image {img_path}: {exc}"
                    )
            # Use Tesseract to extract text from image
            image_text = pytesseract.image_to_string(
                image,
                lang=self._lang
            )
            # TODO: add the summary (explanation)
            image_meta = {
                "url": url,
                "source": f"{path.name} Page.#{page_num}",
                "filename": path.name,
                "index": f"{path.name}:{page_num}",
                "question": '',
                "answer": '',
                "type": 'image',
                "data": {},
                "summary": '',
                "document_meta": {
                    "image_index": img_index,
                    "image_name": img_name,
                    "description": f"Extracted from {page_number}."
                },
                "source_type": self._source_type
            }
            documents.append(
                Document(page_content=image_text, metadata=image_meta)
            )
        return documents

    def _page_table_documents(self, page, path: Path, page_number: int) -> list:
        """Return one markdown Document per table found on the page."""
        page_num = page_number + 1
        documents = []
        try:
            # NOTE(review): self.table_settings is not passed to
            # find_tables() here; confirm whether it should be.
            tabs = page.find_tables()
            for tab_idx, tab in enumerate(tabs):
                df = tab.to_pandas()  # convert to pandas DataFrame
                # Drop fully empty columns and rows before rendering.
                df = df.dropna(axis=1, how='all')
                df = df.dropna(how='all', axis=0)
                table_meta = {
                    "url": '',
                    "source": f"{path.name} Page.#{page_num} Table.#{tab_idx}",
                    "filename": path.name,
                    "index": f"{path.name}:{page_num}",
                    "question": '',
                    "answer": '',
                    "type": 'table',
                    "data": {},
                    "summary": '',
                    "document_meta": {
                        "table_index": tab_idx,
                        "table_shape": df.shape,
                        "table_columns": df.columns.tolist(),
                        "description": f"Extracted from {page_number}."
                    },
                    "source_type": self._source_type
                }
                txt = df.to_markdown()
                if txt:
                    documents.append(
                        Document(page_content=txt, metadata=table_meta)
                    )
        except Exception as exc:
            # Table extraction is best-effort; keep what was collected.
            self.logger.warning(
                f"Error extracting tables from {path.name} "
                f"page {page_num}: {exc}"
            )
        return documents

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using the Fitz library.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents (text, image and table
            documents in page order), or an empty list when the path check
            fails.
        """
        if not self._check_path(path):
            return []
        self.logger.info(f"Loading PDF file: {path}")
        pdf = fitz.open(str(path))  # Open the PDF file
        docs = []
        try:
            for page_number in range(pdf.page_count):
                page = pdf[page_number]
                # first: text
                text = page.get_text()
                if text:
                    docs.append(
                        self._page_text_document(
                            pdf, path, page_number + 1, text
                        )
                    )
                # second: images (OCR)
                if self.parse_images is True:
                    docs.extend(
                        self._page_image_documents(
                            pdf, page, path, page_number
                        )
                    )
                # third: tables
                docs.extend(
                    self._page_table_documents(page, path, page_number)
                )
        finally:
            # Release the document even if a page fails to process.
            pdf.close()
        return docs
@@ -0,0 +1,142 @@
1
+ from collections.abc import Callable
2
+ from typing import Any, Optional, List, Union
3
+ from pathlib import PurePath, Path
4
+ from io import StringIO
5
+ import fitz # PyMuPDF
6
+ from langchain.docstore.document import Document
7
+ from langchain.text_splitter import (
8
+ RecursiveCharacterTextSplitter
9
+ )
10
+ from .basepdf import BasePDF
11
+
12
+
13
class PDFChapterLoader(BasePDF):
    """
    Preserving Chapter Structure from PDF files.

    Text blocks are grouped into chapters; a new chapter starts whenever a
    block uses a title font and its text has not been seen as a title yet.
    """
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        """
        Args:
            path: PDF file (or location) handled by the base loader.
            tokenizer: Forwarded to the base loader.
            text_splitter: Forwarded to the base loader; when falsy a
                RecursiveCharacterTextSplitter is created from
                ``self.tokenizer``.
            source_type: Value written to the ``source_type`` metadata field.
            language: Forwarded to the base loader.
            **kwargs: Forwarded to the base loader. ``title_font`` (str)
                names the font used for chapter titles.
        """
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        # Which Font is used for titles (Chapter separation)
        self.title_font: str = kwargs.get('title_font', 'Calibri-Bold')
        if not text_splitter:
            self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                self.tokenizer,
                chunk_size=2000,
                chunk_overlap=100,
                add_start_index=True,  # If `True`, includes chunk's start index in metadata
                strip_whitespace=True,  # strips whitespace from the start and end
                separators=["\n\n", "\n", "\r\n", "\r", "\f", "\v", "\x0b", "\x0c"],
            )

    def eval_title(self, title_font: str) -> bool:
        """
        Check if the font is a title font.

        Args:
            title_font (str): The font to check.

        Returns:
            bool: True if the font name contains 'Bold' or equals the
            configured title font.
        """
        return 'Bold' in title_font or title_font == self.title_font

    def _load_pdf(self, path: PurePath, **kwargs):
        """
        Split a PDF file into one Document per chapter.

        Args:
            path (PurePath): The path to the PDF file.

        Returns:
            list: Langchain Documents, one per chapter, each carrying the
            metadata of the page on which the chapter started.
        """
        pdf = fitz.open(path)
        self.logger.info(f"Loading PDF file: {path}")
        documents = []
        chapter_text = ''
        chapter_title = ''
        # Metadata of the page where the current chapter began, so a chapter
        # is not labelled with the page where the *next* heading was found.
        chapter_meta = None
        seen_titles = set()  # Keep track of unique chapter titles

        def has_body() -> bool:
            # A chapter is worth saving only if it holds more than its title.
            stripped = chapter_text.strip()
            return bool(stripped) and stripped != chapter_title.strip()

        try:
            for page_num in range(len(pdf)):
                page = pdf.load_page(page_num)
                blocks = page.get_text("dict")["blocks"]
                page_number = page_num + 1
                metadata = {
                    "url": '',
                    "index": f"{path.name} #{page_number}",
                    "source": f"{path.name} #{page_number}",
                    "filename": path.name,
                    "source_type": self._source_type,
                    "type": "pdf",
                    "question": "",
                    "answer": "",
                    "summary": '',
                    "document_meta": {
                        "page_number": page_num,
                        # **pdf.metadata
                    }
                }
                for b in blocks:
                    if b['type'] != 0:  # only text blocks carry "lines"
                        continue
                    spans = [
                        span for line in b["lines"] for span in line["spans"]
                    ]
                    block_text = ''.join(span['text'] for span in spans)
                    title = block_text.strip()
                    is_title = any(
                        self.eval_title(span['font']) for span in spans
                    )
                    if is_title and title not in seen_titles:
                        # Save the current chapter and start a new one.
                        if has_body():
                            documents.append(Document(
                                page_content=chapter_text.strip(),
                                metadata=chapter_meta or metadata
                            ))
                        chapter_title = f"**{title}**: "
                        chapter_text = chapter_title
                        chapter_meta = metadata
                        seen_titles.add(title)
                    else:
                        # Plain text, or a repeated title: keep appending.
                        if chapter_meta is None:
                            chapter_meta = metadata
                        chapter_text += block_text

                    # Add a newline after processing each block, if not a
                    # chapter title
                    if not block_text.strip().startswith(chapter_title):
                        chapter_text += "\n"
        finally:
            # Release the document even if a page fails to process.
            pdf.close()

        # Save the last chapter if it exists and it's not just the title
        if has_body():
            documents.append(Document(
                page_content=chapter_text.strip(),
                metadata=chapter_meta
            ))
        return documents
@@ -0,0 +1,112 @@
1
+ from collections.abc import Callable
2
+ from typing import Any, Optional, List, Union
3
+ from pathlib import PurePath, Path
4
+ from io import StringIO
5
+ import fitz # PyMuPDF
6
+ from langchain.docstore.document import Document
7
+ from langchain.text_splitter import (
8
+ RecursiveCharacterTextSplitter
9
+ )
10
+ from .basepdf import BasePDF
11
+
12
+
13
class PDFFnLoader(BasePDF):
    """
    Loading a PDF with including function processing.

    NOTE(review): this loader looks unfinished. Tables are extracted per
    page but ``_load_pdf`` does not yet turn them into Documents and always
    returns an empty list.
    """
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Union[None, Callable[..., Any]] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        """
        Args:
            path: PDF file (or location) handled by the base loader.
            tokenizer: Forwarded to the base loader.
            text_splitter: Forwarded to the base loader; when falsy a
                RecursiveCharacterTextSplitter is created from
                ``self.tokenizer``.
            source_type: Value written to the ``source_type`` metadata field.
            language: Forwarded to the base loader.
            **kwargs: Forwarded to the base loader, except ``table_settings``
                (dict), which overrides the default table settings.
        """
        # Removed from kwargs so it is not forwarded to the base loader.
        table_settings = kwargs.pop('table_settings', {})
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        if not text_splitter:
            self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                self.tokenizer,
                chunk_size=2000,
                chunk_overlap=100,
                add_start_index=True,  # If `True`, includes chunk's start index in metadata
                strip_whitespace=True,  # strips whitespace from the start and end
                separators=["\n\n", "\n", "\r\n", "\r", "\f", "\v", "\x0b", "\x0c"],
            )
        # Define settings for Fitz Table Processing; caller values win.
        self.table_settings = {
            "vertical_strategy": "lines",
            "horizontal_strategy": "text",
            "intersection_x_tolerance": 5,
            "intersection_y_tolerance": 5,
            "edge_min_length": 10,
            **table_settings,
        }

    def set_metadata(self, path, page, page_number, **kwargs) -> dict:
        """
        Build the metadata dict for one page.

        Args:
            path: Path of the PDF; only ``path.name`` is used.
            page: The page object (currently unused).
            page_number: Zero-based page index.
            **kwargs: Extra entries merged into ``document_meta``.

        Returns:
            dict: Page metadata using the one-based page number throughout.
        """
        # Human-facing page numbers are one-based; use the same number in
        # index/source as in document_meta.
        n = page_number + 1
        return {
            "url": '',
            "index": f"{path.name} #{n}",
            "source": f"{path.name} #{n}",
            "filename": path.name,
            "source_type": self._source_type,
            "type": "pdf",
            "question": "",
            "answer": "",
            "summary": '',
            "document_meta": {
                "page_number": n,
                **kwargs
            }
        }

    def processing_table(self, table, table_idx, page, **kwargs) -> list:
        """
        Extract the cell text of one table.

        Args:
            table: A table found by ``page.find_tables``.
            table_idx: Index of the table within its page.
            page: The page the table belongs to (currently unused).

        Returns:
            list: One list of cell texts per row, skipping rows whose cells
            are all empty.
        """
        # extract() yields the text of every cell, row by row.
        table_data = [
            row for row in table.extract()
            if any(cell not in (None, '') for cell in row)
        ]
        self.logger.debug(
            f"Table #{table_idx}: extracted {len(table_data)} rows"
        )
        return table_data

    def _load_pdf(self, path: PurePath, **kwargs):
        """
        Open a PDF file and process the tables of every page.

        Args:
            path (PurePath): The path to the PDF file.

        Returns:
            list: Currently always an empty list.
        """
        pdf = fitz.open(path)
        self.logger.info(f"Loading PDF file: {path}")
        try:
            for page_num in range(len(pdf)):
                page = pdf.load_page(page_num)
                tables = page.find_tables(**self.table_settings)
                for tab_idx, table in enumerate(tables):
                    # NOTE(review): the extracted rows are not used yet.
                    self.processing_table(table, tab_idx, page)
        finally:
            # Release the document even if a page fails to process.
            pdf.close()
        return []