gllm-docproc-binary 0.1.8__cp312-cp312-macosx_14_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gllm-docproc-binary might be problematic. Click here for more details.
- gllm_docproc/__init__.pyi +0 -0
- gllm_docproc/chunker/__init__.pyi +3 -0
- gllm_docproc/chunker/base_chunker.pyi +29 -0
- gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
- gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
- gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
- gllm_docproc/chunker/table/__init__.pyi +3 -0
- gllm_docproc/chunker/table/table_chunker.pyi +45 -0
- gllm_docproc/converter/__init__.pyi +3 -0
- gllm_docproc/converter/base_converter.pyi +16 -0
- gllm_docproc/data_generator/__init__.pyi +3 -0
- gllm_docproc/data_generator/base_data_generator.pyi +19 -0
- gllm_docproc/downloader/__init__.pyi +3 -0
- gllm_docproc/downloader/base_downloader.pyi +16 -0
- gllm_docproc/downloader/html/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
- gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
- gllm_docproc/downloader/html/html_downloader.pyi +91 -0
- gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +29 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +50 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
- gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +45 -0
- gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
- gllm_docproc/downloader/html/utils/web_utils.pyi +21 -0
- gllm_docproc/dpo_router/__init__.pyi +3 -0
- gllm_docproc/dpo_router/base_dpo_router.pyi +17 -0
- gllm_docproc/housekeeping/__init__.pyi +3 -0
- gllm_docproc/housekeeping/base_housekeeping.pyi +15 -0
- gllm_docproc/indexer/__init__.pyi +3 -0
- gllm_docproc/indexer/base_indexer.pyi +31 -0
- gllm_docproc/indexer/graph/__init__.pyi +3 -0
- gllm_docproc/indexer/knowledge_graph/__init__.pyi +4 -0
- gllm_docproc/loader/__init__.pyi +4 -0
- gllm_docproc/loader/audio/__init__.pyi +3 -0
- gllm_docproc/loader/base_loader.pyi +31 -0
- gllm_docproc/loader/docx/__init__.pyi +5 -0
- gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
- gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
- gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
- gllm_docproc/loader/exception/__init__.pyi +3 -0
- gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
- gllm_docproc/loader/html/__init__.pyi +5 -0
- gllm_docproc/loader/html/exception/__init__.pyi +3 -0
- gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
- gllm_docproc/loader/html/flat/__init__.pyi +3 -0
- gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +52 -0
- gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
- gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
- gllm_docproc/loader/html/html_base_loader.pyi +25 -0
- gllm_docproc/loader/html/nested/__init__.pyi +3 -0
- gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
- gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
- gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
- gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
- gllm_docproc/loader/html/utils/__init__.pyi +0 -0
- gllm_docproc/loader/html/utils/flat_table_utils.pyi +29 -0
- gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
- gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
- gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
- gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
- gllm_docproc/loader/json/__init__.pyi +3 -0
- gllm_docproc/loader/json/json_elements_loader.pyi +33 -0
- gllm_docproc/loader/loader_utils.pyi +42 -0
- gllm_docproc/loader/pdf/__init__.pyi +13 -0
- gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +40 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +45 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +50 -0
- gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
- gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
- gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pymupdf_loader.pyi +43 -0
- gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +44 -0
- gllm_docproc/loader/pdf/pymupdf_utils.pyi +34 -0
- gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
- gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
- gllm_docproc/loader/pipeline_loader.pyi +48 -0
- gllm_docproc/loader/txt/__init__.pyi +3 -0
- gllm_docproc/loader/txt/txt_loader.pyi +26 -0
- gllm_docproc/loader/xlsx/__init__.pyi +3 -0
- gllm_docproc/loader/xlsx/openpyxl_loader.pyi +37 -0
- gllm_docproc/model/__init__.pyi +4 -0
- gllm_docproc/model/element.pyi +37 -0
- gllm_docproc/model/element_metadata.pyi +35 -0
- gllm_docproc/parser/__init__.pyi +4 -0
- gllm_docproc/parser/base_parser.pyi +29 -0
- gllm_docproc/parser/document/__init__.pyi +6 -0
- gllm_docproc/parser/document/docx_parser.pyi +27 -0
- gllm_docproc/parser/document/pdf_parser.pyi +35 -0
- gllm_docproc/parser/document/txt_parser.pyi +22 -0
- gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
- gllm_docproc/parser/html/__init__.pyi +4 -0
- gllm_docproc/parser/html/flat/__init__.pyi +0 -0
- gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
- gllm_docproc/parser/html/nested/__init__.pyi +0 -0
- gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
- gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
- gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
- gllm_docproc/parser/pipeline_parser.pyi +33 -0
- gllm_docproc/parser/table/__init__.pyi +3 -0
- gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
- gllm_docproc/request_handler/__init__.pyi +3 -0
- gllm_docproc/request_handler/base_request_handler.pyi +17 -0
- gllm_docproc/response_handler/__init__.pyi +3 -0
- gllm_docproc/response_handler/base_response_handler.pyi +39 -0
- gllm_docproc/utils/__init__.pyi +0 -0
- gllm_docproc/utils/file_utils.pyi +76 -0
- gllm_docproc/utils/html_constants.pyi +121 -0
- gllm_docproc.build/.gitignore +1 -0
- gllm_docproc.cpython-312-darwin.so +0 -0
- gllm_docproc.pyi +149 -0
- gllm_docproc_binary-0.1.8.dist-info/METADATA +110 -0
- gllm_docproc_binary-0.1.8.dist-info/RECORD +123 -0
- gllm_docproc_binary-0.1.8.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
class DictionaryUtils:
|
|
2
|
+
"""A utility class providing methods to manipulate dictionaries."""
|
|
3
|
+
@staticmethod
|
|
4
|
+
def add_or_skip_value(dictionary, key, value):
|
|
5
|
+
"""Adds a value to a dictionary if the value is not None for a given key.
|
|
6
|
+
|
|
7
|
+
Args:
|
|
8
|
+
dictionary (dict): The dictionary to be modified.
|
|
9
|
+
key (hashable): The key where the value needs to be added.
|
|
10
|
+
value (any): The value to be added.
|
|
11
|
+
|
|
12
|
+
Returns:
|
|
13
|
+
dict: The modified dictionary.
|
|
14
|
+
"""
|
|
15
|
+
@staticmethod
|
|
16
|
+
def append_value(dictionary, key, value):
|
|
17
|
+
"""Appends a value to a list under a specific key in a dictionary.
|
|
18
|
+
|
|
19
|
+
If the key already exists in the dictionary, the value is appended to the list under that key.
|
|
20
|
+
If the key does not exist, a new list is created with the value as its first element.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
dictionary (dict): The dictionary to be modified.
|
|
24
|
+
key (hashable): The key under which the value needs to be added.
|
|
25
|
+
value (any): The value to be appended to the list under the key.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
dict: The modified dictionary with the value appended to the list under the key.
|
|
29
|
+
"""
|
|
30
|
+
@staticmethod
|
|
31
|
+
def put_key_to_bottom(dictionary, key):
|
|
32
|
+
"""Rearange key in dictionary.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
dictionary (dict): The dictionary to be modified.
|
|
36
|
+
key (hashable): The key.
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
dict: The modified dictionary.
|
|
40
|
+
"""
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
from .html_nested_element_handler import get_element as get_element
|
|
2
|
+
from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
|
|
3
|
+
from gllm_docproc.loader.html.nested.dictionary_utils import DictionaryUtils as DictionaryUtils
|
|
4
|
+
from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
|
|
5
|
+
from gllm_docproc.loader.html.utils.string_utils import StringUtils as StringUtils
|
|
6
|
+
from gllm_docproc.loader.html.utils.table_utils import TableUtils as TableUtils
|
|
7
|
+
from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
|
|
8
|
+
|
|
9
|
+
def get_element_content(content_selector, removed_components: RemovedComponents) -> list[dict]:
|
|
10
|
+
'''Traverses each element to get the content.
|
|
11
|
+
|
|
12
|
+
This function extract html body recursively
|
|
13
|
+
|
|
14
|
+
Input example:
|
|
15
|
+
|
|
16
|
+
.. code-block:: html
|
|
17
|
+
|
|
18
|
+
<html>
|
|
19
|
+
<head>
|
|
20
|
+
<title>Title</title>
|
|
21
|
+
</head>
|
|
22
|
+
<body>
|
|
23
|
+
<div class="container">
|
|
24
|
+
<h1>Welcome to My Website</h1>
|
|
25
|
+
<div>
|
|
26
|
+
Hello World
|
|
27
|
+
</div>
|
|
28
|
+
</div>
|
|
29
|
+
<p>This is another paragraph.</p>
|
|
30
|
+
</body>
|
|
31
|
+
</html>
|
|
32
|
+
|
|
33
|
+
Output:
|
|
34
|
+
|
|
35
|
+
.. code-block:: groovy
|
|
36
|
+
|
|
37
|
+
[
|
|
38
|
+
{
|
|
39
|
+
\'tag\': \'body\',
|
|
40
|
+
\'class\': None,
|
|
41
|
+
\'content\': [
|
|
42
|
+
{
|
|
43
|
+
\'tag\': \'div\',
|
|
44
|
+
\'class\': \'container\',
|
|
45
|
+
\'content\': [
|
|
46
|
+
{
|
|
47
|
+
\'tag\': \'h1\',
|
|
48
|
+
\'class\': None, \'content\': [
|
|
49
|
+
{
|
|
50
|
+
\'tag\': \'text\',
|
|
51
|
+
\'content\': \'Welcome to My Website\'
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
\'tag\': \'div\',
|
|
57
|
+
\'class\': None,
|
|
58
|
+
\'content\': [
|
|
59
|
+
{\'tag\': \'text\', \'content\': \'Hello World\'}
|
|
60
|
+
]
|
|
61
|
+
}
|
|
62
|
+
]
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
\'tag\': \'p\',
|
|
66
|
+
\'class\': None,
|
|
67
|
+
\'content\': [
|
|
68
|
+
{\'tag\': \'text\', \'content\': \'This is another paragraph.\'}
|
|
69
|
+
]
|
|
70
|
+
}
|
|
71
|
+
]
|
|
72
|
+
}
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
content_selector: The content to be traversed.
|
|
77
|
+
removed_components: Removed class or tags.
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
The List of extracted contents.
|
|
81
|
+
'''
|
|
82
|
+
def is_base_element(content_selector, removed_components: RemovedComponents) -> bool:
|
|
83
|
+
"""Check if the given content selector represents a base element.
|
|
84
|
+
|
|
85
|
+
See html_flat_base_handler.py for more information.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
content_selector: The content selector to check.
|
|
89
|
+
removed_components (RemovedComponents): An instance of RemovedComponents class.
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
bool: True if the content_selector represents a base element; False otherwise.
|
|
93
|
+
"""
|
|
94
|
+
def handle_base_element(content_selector, removed_components: RemovedComponents) -> list[dict]:
|
|
95
|
+
"""Handle the processing of a base HTML element.
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
content_selector: The content selector representing the HTML element.
|
|
99
|
+
removed_components (RemovedComponents): An object containing information about components to be removed.
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
List of dict : A List of dictionaries containing information about the HTML element, or None
|
|
103
|
+
if the element should be skipped.
|
|
104
|
+
- tag: HTML tag name.
|
|
105
|
+
- class: CSS class of the element (if available).
|
|
106
|
+
Additional keys may be added based on the specific element handler.
|
|
107
|
+
"""
|
|
108
|
+
def get_handler(tag: str):
|
|
109
|
+
"""Get the element information from the specified content selector.
|
|
110
|
+
|
|
111
|
+
Args:
|
|
112
|
+
tag (str): The HTML tag to get the handler for.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
dict: A dictionary containing information about the HTML element.
|
|
116
|
+
- tag: HTML tag name.
|
|
117
|
+
- class: CSS class of the element (if available).
|
|
118
|
+
Additional keys may be added based on the specific element handler
|
|
119
|
+
"""
|
|
120
|
+
def create_text_dict(message):
|
|
121
|
+
"""Creates a dictionary with 'text' tag and specified content.
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
message: The content to be added to the dictionary.
|
|
125
|
+
|
|
126
|
+
Returns:
|
|
127
|
+
A dictionary with 'text' tag and specified content.
|
|
128
|
+
"""
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from gllm_docproc.loader.html.nested.dictionary_utils import DictionaryUtils as DictionaryUtils
|
|
2
|
+
from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
|
|
3
|
+
|
|
4
|
+
def get_element(content_selector) -> dict:
|
|
5
|
+
"""Get the element information from the specified content selector.
|
|
6
|
+
|
|
7
|
+
Args:
|
|
8
|
+
content_selector: The content selector representing the HTML element.
|
|
9
|
+
|
|
10
|
+
Returns:
|
|
11
|
+
dict: A dictionary containing information about the HTML element.
|
|
12
|
+
- tag: HTML tag name.
|
|
13
|
+
- class: CSS class of the element (if available).
|
|
14
|
+
Additional keys may be added based on the specific element handler
|
|
15
|
+
"""
|
|
16
|
+
def get_handler(tag: str):
|
|
17
|
+
"""Gets the handler for the specified HTML tag.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
tag (str): The HTML tag to get the handler for.
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
Callable: The handler function for the specified HTML tag.
|
|
24
|
+
"""
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from .html_nested_base_handler import get_element_content as get_element_content
|
|
2
|
+
from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
|
|
3
|
+
from gllm_docproc.loader.html.html_base_loader import HTMLBaseLoader as HTMLBaseLoader
|
|
4
|
+
from gllm_docproc.loader.html.utils.html_utils import extract_html_head as extract_html_head
|
|
5
|
+
from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
|
|
6
|
+
from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, MetaDataKeys as MetaDataKeys
|
|
7
|
+
|
|
8
|
+
class HTMLNestedLoader(HTMLBaseLoader):
|
|
9
|
+
"""A loader class for loading web content and extracting information.
|
|
10
|
+
|
|
11
|
+
This class inherits from the BaseLoader class and provides methods to load web content,
|
|
12
|
+
extract information, and scrape data using Scrapy spiders.
|
|
13
|
+
"""
|
|
14
|
+
def __init__(self) -> None:
|
|
15
|
+
"""Initialize the HTMLNestedLoader."""
|
|
File without changes
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.model.element import Element as Element
|
|
3
|
+
from gllm_docproc.utils.html_constants import HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys, TableConstants as TableConstants
|
|
4
|
+
|
|
5
|
+
class FlatTableUtils:
|
|
6
|
+
"""A utility class providing methods for extracting data from HTML tables."""
|
|
7
|
+
colcount: int
|
|
8
|
+
rowspans: Incomplete
|
|
9
|
+
def __init__(self) -> None:
|
|
10
|
+
"""Initialize the FlatTableUtils."""
|
|
11
|
+
def generate_tables(self, content: list[Element]) -> list[list[str]]:
|
|
12
|
+
"""Generate tables from HTML content.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
content (List[Element]): The list of Element instances representing the HTML content.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
List[List[str]]: A list containing the generated tables.
|
|
19
|
+
"""
|
|
20
|
+
def filter_table(self, table_content: list[Element]) -> tuple[list[Element], list[Element]]:
|
|
21
|
+
"""Filter the HTML table content.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
table_content (List[Element]): The list of Element instances representing the HTML table.
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
tuple[List[Element], List[Element]]: A tuple containing the filtered table content
|
|
28
|
+
and the removed elements.
|
|
29
|
+
"""
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
|
|
2
|
+
from gllm_docproc.model.element import Element as Element
|
|
3
|
+
from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, HTML as HTML
|
|
4
|
+
from gllm_docproc.utils.html_constants import HTMLTags as HTMLTags, MetaDataKeys as MetaDataKeys
|
|
5
|
+
from scrapy.http import HtmlResponse as HtmlResponse
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
def is_html_content(content: str) -> bool:
|
|
9
|
+
'''Check if the provided content appears to be HTML.
|
|
10
|
+
|
|
11
|
+
This function performs a case-insensitive check to determine if the content contains HTML tags,
|
|
12
|
+
specifically by searching for the opening and closing HTML tags ("<html" and "</html>").
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
content (str): The content to check.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
bool: True if the content is identified as HTML; False otherwise.
|
|
19
|
+
'''
|
|
20
|
+
def extract_html_head(response: HtmlResponse, element_metadata: dict[str, Any] | None) -> ElementMetadata:
|
|
21
|
+
"""Extracts metadata from an HTML response.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
response (HtmlResponse): The HTML response.
|
|
25
|
+
element_metadata (dict[str, Any] | None): The element metadata.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
ElementMetadata: A class containing element metadata.
|
|
29
|
+
|
|
30
|
+
Raises:
|
|
31
|
+
HtmlLoadException: If an error occurs during the extraction process.
|
|
32
|
+
"""
|
|
33
|
+
def extract_html_title_tag(metadata: ElementMetadata) -> list[Element]:
|
|
34
|
+
"""Gets the title element as a Element.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
metadata (ElementMetadata): A class containing element metadata.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
List[Element]: List containing a single Element instance representing the title element.
|
|
41
|
+
"""
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
class RemovedComponents:
|
|
2
|
+
"""Class representing removed components from a document.
|
|
3
|
+
|
|
4
|
+
This class defines three methods for retrieving partial class, full class, and HTML tags
|
|
5
|
+
associated with removed components.
|
|
6
|
+
"""
|
|
7
|
+
def get_partial_class(self) -> list[str]:
|
|
8
|
+
"""Get partial class.
|
|
9
|
+
|
|
10
|
+
Method to get the partial class of the removed component. Partial class consists of
|
|
11
|
+
classes that will be filtered.
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
str: The partial class name associated with the removed component.
|
|
15
|
+
"""
|
|
16
|
+
def get_full_class(self) -> list[str]:
|
|
17
|
+
"""Get full class.
|
|
18
|
+
|
|
19
|
+
Method to get the full class of the removed component. Full class consists of
|
|
20
|
+
exact match of classes that will be filtered.
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
str: The full class name associated with the removed component.
|
|
24
|
+
"""
|
|
25
|
+
def get_html_tags(self) -> list[str]:
|
|
26
|
+
"""Method to get the HTML tags associated with the removed component.
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
list: A list of HTML tags associated with the removed component.
|
|
30
|
+
"""
|
|
31
|
+
@staticmethod
|
|
32
|
+
def is_removed_component(tag: str | None, class_: str | None, removed_components: RemovedComponents | None) -> bool:
|
|
33
|
+
"""Checks if a component should be removed based on its tag and class.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
tag (str): The tag of the component.
|
|
37
|
+
class_ (str): The class of the component.
|
|
38
|
+
removed_components (RemovedComponents): The components to be removed, including HTML tags and classes.
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
True if the component should be removed, False otherwise.
|
|
42
|
+
"""
|
|
43
|
+
@staticmethod
|
|
44
|
+
def check_list_in_substring(message: str, check_list: list[str]) -> bool:
|
|
45
|
+
"""Checks if any substring from the check_list exists in the message string.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
message (str): The string to search for substrings.
|
|
49
|
+
check_list (list): A list of substrings to be checked.
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
- bool: True if any substring from check_list is found in message, otherwise False.
|
|
53
|
+
"""
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
class StringUtils:
|
|
2
|
+
"""A utility class providing methods for text cleaning."""
|
|
3
|
+
@staticmethod
|
|
4
|
+
def clean_text(text: str | None) -> str:
|
|
5
|
+
"""Clean the input text by removing extra whitespace, newlines, and tabs.
|
|
6
|
+
|
|
7
|
+
Args:
|
|
8
|
+
text (str): The text to be cleaned.
|
|
9
|
+
|
|
10
|
+
Returns:
|
|
11
|
+
str: The cleaned text.
|
|
12
|
+
"""
|
|
13
|
+
@staticmethod
|
|
14
|
+
def remove_extension(file_name: str) -> str:
|
|
15
|
+
"""Removes the file extension from a given file name.
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
file_name (str): The name of the file from which the extension will be removed.
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
str: File name without the extension.
|
|
22
|
+
"""
|
|
23
|
+
@staticmethod
|
|
24
|
+
def append_character(text: str, new_char: str) -> str:
|
|
25
|
+
"""Appends a character to the end of a string, handling newline endings.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
text (str): The input text string to which the character will be appended.
|
|
29
|
+
new_char (str): The character to append to the text.
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
str: The modified string with the appended character.
|
|
33
|
+
"""
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.loader.html.utils.string_utils import StringUtils as StringUtils
|
|
3
|
+
from gllm_docproc.utils.html_constants import TableConstants as TableConstants
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
class TableUtils:
|
|
7
|
+
"""A utility class providing methods for extracting data from HTML tables."""
|
|
8
|
+
colcount: int
|
|
9
|
+
table_selector: Incomplete
|
|
10
|
+
rowspans: Incomplete
|
|
11
|
+
def __init__(self, table_selector) -> None:
|
|
12
|
+
"""Initialize TableUtils with the given table selector.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
table_selector: Selector for the HTML table.
|
|
16
|
+
"""
|
|
17
|
+
def get_table(self) -> dict[str, Any]:
|
|
18
|
+
"""Extract data from the HTML table and return it as a dictionary."""
|
|
19
|
+
def extract_table(self):
|
|
20
|
+
"""Extract data from the HTML table and return it as a list of lists representing the table structure."""
|
|
21
|
+
def extract_table_row_type(self):
|
|
22
|
+
"""Extract metadata from the HTML table and return it as a list of strings representing the row types."""
|
|
23
|
+
def update_col_count(self, row_cells, prev_rowspans) -> None:
|
|
24
|
+
"""Update the number of columns in the table.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
row_cells: List of HTML cells in a row.
|
|
28
|
+
prev_rowspans: List of previous rowspans.
|
|
29
|
+
"""
|
|
30
|
+
def get_row_type(self, row):
|
|
31
|
+
"""Get the type of the row.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
row: HTML row.
|
|
35
|
+
"""
|
|
36
|
+
def extract_max_char_count(self, table):
|
|
37
|
+
"""Extract maximum character count.
|
|
38
|
+
|
|
39
|
+
Extract metadata from the HTML table and return it as a list of integers representing
|
|
40
|
+
the maximum number of characters in each column.
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
table: List of lists representing the table structure.
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
list: A list of integers representing the maximum number of characters in each column.
|
|
47
|
+
"""
|
|
48
|
+
@staticmethod
|
|
49
|
+
def convert_to_texts(table) -> list[str]:
|
|
50
|
+
"""Convert to texts.
|
|
51
|
+
|
|
52
|
+
This method processes table content by iterating over its metadata, handling each row based
|
|
53
|
+
on its type, and appending the result to the table data.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
table: table which will be converted to text
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
list: A list of dictionaries containing the extracted table data.
|
|
60
|
+
"""
|
|
61
|
+
@staticmethod
|
|
62
|
+
def print_row(row, col_size: Incomplete | None = None):
|
|
63
|
+
"""Formats a table row.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
row (list): The row to be formatted.
|
|
67
|
+
col_size (list | None): List of max characters size in each column
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
str: The formatted row.
|
|
71
|
+
"""
|
|
72
|
+
@staticmethod
|
|
73
|
+
def print_table_separator(row):
|
|
74
|
+
"""Formats a table separator.
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
str: The formatted table separator.
|
|
78
|
+
"""
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
|
|
2
|
+
from gllm_docproc.loader.loader_utils import validate_file_extension as validate_file_extension
|
|
3
|
+
from gllm_docproc.model.element import Element as Element
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
class JSONElementsLoader(BaseLoader):
|
|
7
|
+
"""JSON Elements Loader class.
|
|
8
|
+
|
|
9
|
+
This class provides a loader for extracting information from JSON files.
|
|
10
|
+
The JSON file must be in the format of list of dictionaries. where each dictionary
|
|
11
|
+
must be following the structure of Element class.
|
|
12
|
+
|
|
13
|
+
Methods:
|
|
14
|
+
load(source, element_metadata, **kwargs): Load and process a document.
|
|
15
|
+
"""
|
|
16
|
+
def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> list[dict[str, Any]]:
|
|
17
|
+
"""Load and process a document.
|
|
18
|
+
|
|
19
|
+
This method loads the JSON file and returns the list of elements. If file id is provided,
|
|
20
|
+
the file id will be added to the element metadata and the chunk id and chunk relation metadata
|
|
21
|
+
that contains file id as prefix will be updated.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
source (str): The file path of the JSON file.
|
|
25
|
+
loaded_elements (Any, optional): The loaded elements. JSON Loader ignore this parameter.
|
|
26
|
+
**kwargs (Any): The keyword arguments.
|
|
27
|
+
|
|
28
|
+
Kwargs:
|
|
29
|
+
file_id (str, optional): The file id of for the elements. Defaults to None.
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
list[dict[str, Any]]: The loaded elements.
|
|
33
|
+
"""
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from gllm_docproc.loader.exception import UnsupportedFileExtensionError as UnsupportedFileExtensionError
|
|
2
|
+
from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
|
|
3
|
+
|
|
4
|
+
def validate_file_extension(expected_extension: str, loader_name: str):
|
|
5
|
+
"""Decorator to validate the file extension of the input file.
|
|
6
|
+
|
|
7
|
+
Args:
|
|
8
|
+
expected_extension (str): The expected file extension.
|
|
9
|
+
loader_name (str): The name of the loader.
|
|
10
|
+
|
|
11
|
+
Returns:
|
|
12
|
+
Callable[[Callable[..., Any]], Callable[..., Any]]: A decorator that wraps the original
|
|
13
|
+
function to validate the file extension.
|
|
14
|
+
|
|
15
|
+
Raises:
|
|
16
|
+
UnsupportedFileExtensionError: If the file extension does not match the expected extension.
|
|
17
|
+
"""
|
|
18
|
+
def create_base_element_metadata(source: str, source_type: str) -> ElementMetadata:
|
|
19
|
+
"""Create the base element metadata.
|
|
20
|
+
|
|
21
|
+
This function creates the base element metadata for the loaded element. Base element metadata
|
|
22
|
+
includes the source, source type, and loaded datetime.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
source (str): The source of the element.
|
|
26
|
+
source_type (str): The source type.
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
ElementMetadata: The base element metadata.
|
|
30
|
+
"""
|
|
31
|
+
def trim_table_empty_cells(table: list[list[str]]) -> list[list[str]]:
|
|
32
|
+
"""Trim the empty cells in the table.
|
|
33
|
+
|
|
34
|
+
This function trims the empty cells in the table by removing the empty cells at the end of each
|
|
35
|
+
row. The function also ensures that all rows have the same number of columns.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
table (List[List[str]]): A list of lists containing the table content.
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
List[List[str]]: A list of lists containing the trimmed table content.
|
|
42
|
+
"""
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .adobe_pdf_extract_loader import AdobePDFExtractLoader as AdobePDFExtractLoader
|
|
2
|
+
from .azure_ai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader as AzureAIDocumentIntelligenceLoader
|
|
3
|
+
from .azure_ai_document_intelligence_raw_loader import AzureAIDocumentIntelligenceRawLoader as AzureAIDocumentIntelligenceRawLoader
|
|
4
|
+
from .glair_vision_ocr_loader import GLAIRVisionOCRLoader as GLAIRVisionOCRLoader
|
|
5
|
+
from .pdf_miner_loader import PDFMinerLoader as PDFMinerLoader
|
|
6
|
+
from .pdf_miner_word_loader import PDFMinerWordLoader as PDFMinerWordLoader
|
|
7
|
+
from .pdf_plumber_loader import PDFPlumberLoader as PDFPlumberLoader
|
|
8
|
+
from .pymupdf_loader import PyMuPDFLoader as PyMuPDFLoader
|
|
9
|
+
from .pymupdf_span_loader import PyMuPDFSpanLoader as PyMuPDFSpanLoader
|
|
10
|
+
from .tabula_loader import TabulaLoader as TabulaLoader
|
|
11
|
+
from .text_inject_pdf_plumber_loader import TextInjectPDFPlumberLoader as TextInjectPDFPlumberLoader
|
|
12
|
+
|
|
13
|
+
__all__ = ['AdobePDFExtractLoader', 'AzureAIDocumentIntelligenceLoader', 'AzureAIDocumentIntelligenceRawLoader', 'GLAIRVisionOCRLoader', 'PDFMinerLoader', 'PDFMinerWordLoader', 'PDFPlumberLoader', 'PyMuPDFLoader', 'PyMuPDFSpanLoader', 'TabulaLoader', 'TextInjectPDFPlumberLoader']
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset as CloudAsset
|
|
3
|
+
from adobe.pdfservices.operation.io.stream_asset import StreamAsset as StreamAsset
|
|
4
|
+
from adobe.pdfservices.operation.pdf_services_response import PDFServicesResponse as PDFServicesResponse
|
|
5
|
+
from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
|
|
6
|
+
from gllm_docproc.loader.loader_utils import validate_file_extension as validate_file_extension
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
class AdobePDFExtractLoader(BaseLoader):
    """Adobe PDF Extract Loader class.

    This class provides a loader for extracting information from PDF files using Adobe PDF Extract.
    It implements the 'load' method to load PDF files and extract information.

    Attributes:
        credentials (Incomplete): Credentials object for the Adobe PDF Services API.
            NOTE(review): presumably constructed from ``client_id`` and ``client_secret``
            in ``__init__``; the stub exposes it only as ``Incomplete`` — confirm against
            the implementation.

    Methods:
        load(source, loaded_elements, **kwargs): Loads a PDF file and extracts information.
    """
    # Credentials used to authenticate against the Adobe PDF Services API.
    credentials: Incomplete
    def __init__(self, client_id: str, client_secret: str) -> None:
        """Initializes the Adobe PDF Extract Loader.

        Args:
            client_id (str): The client ID for the Adobe PDF Extract API.
            client_secret (str): The client secret for the Adobe PDF Extract API.
        """
    def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> dict[str, Any]:
        """Loads a PDF file and extracts information using Adobe PDF Extract.

        This method loads a PDF file and extracts information from the file using Adobe PDF Extract.
        The extracted information is returned as a dictionary. The extracted information includes text,
        tables, and other elements from the PDF file.

        Args:
            source (str): The source PDF file to load and extract information from.
            loaded_elements (Any): A list of loaded elements to be processed.
                Defaults to None.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict[str, Any]: The extracted information as a dictionary.
        """
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
|
|
3
|
+
from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
|
|
4
|
+
from gllm_docproc.loader.pdf.azure_ai_document_intelligence_raw_loader import AzureAIDocumentIntelligenceRawLoader as AzureAIDocumentIntelligenceRawLoader
|
|
5
|
+
from gllm_docproc.loader.pdf.pdf_loader_utils import merge_loaded_elements_by_coordinates as merge_loaded_elements_by_coordinates
|
|
6
|
+
from gllm_docproc.model.element import Element as Element, TABLE as TABLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
|
|
7
|
+
from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
class AzureAIDocumentIntelligenceLoader(BaseLoader):
    """Azure AI Document Intelligence Loader class.

    This class provides a loader for extracting text, tables, and images from PDF files
    using the Azure AI Document Intelligence API. It implements the 'load' method to handle document
    loading from a given source.

    Attributes:
        endpoint (Incomplete): The Azure AI Document Intelligence service endpoint.
        key (Incomplete): The API key used to authenticate with the service.
        logger (Incomplete): Logger instance for this loader.
            NOTE(review): the three attributes above are typed ``Incomplete`` in this stub;
            the first two are presumably the ``endpoint``/``key`` passed to ``__init__`` —
            confirm against the implementation.

    Methods:
        load(source, loaded_elements, **kwargs): Load and process a document.
    """
    # Service endpoint URL for the Azure AI Document Intelligence API.
    endpoint: Incomplete
    # API key for the Azure AI Document Intelligence API.
    key: Incomplete
    # Logger used by this loader.
    logger: Incomplete
    def __init__(self, endpoint: str, key: str) -> None:
        """Initializes the Azure AI Document Intelligence Loader class.

        Args:
            endpoint (str): The endpoint for the Azure AI Document Intelligence API.
            key (str): The key for the Azure AI Document Intelligence API.
        """
    def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
        """Load and process a document using the Azure AI Document Intelligence API.

        Args:
            source (str): The source of the document to be processed.
            loaded_elements (list[dict[str, Any]], optional): A list of dictionaries containing loaded content and
                metadata. Defaults to None.
            **kwargs (Any): Additional keyword arguments for the loader.

        Kwargs:
            raw_output (dict[str, Any], optional): The raw output from the Azure AI Document Intelligence Raw Loader.
            original_source (str, optional): The original source of the document.

        Returns:
            list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
        """
|