gllm_docproc_binary-0.1.8-cp312-cp312-manylinux_2_31_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of gllm-docproc-binary might be problematic.
- gllm_docproc/__init__.pyi +0 -0
- gllm_docproc/chunker/__init__.pyi +3 -0
- gllm_docproc/chunker/base_chunker.pyi +29 -0
- gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
- gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
- gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
- gllm_docproc/chunker/table/__init__.pyi +3 -0
- gllm_docproc/chunker/table/table_chunker.pyi +45 -0
- gllm_docproc/converter/__init__.pyi +3 -0
- gllm_docproc/converter/base_converter.pyi +16 -0
- gllm_docproc/data_generator/__init__.pyi +3 -0
- gllm_docproc/data_generator/base_data_generator.pyi +19 -0
- gllm_docproc/downloader/__init__.pyi +3 -0
- gllm_docproc/downloader/base_downloader.pyi +16 -0
- gllm_docproc/downloader/html/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
- gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
- gllm_docproc/downloader/html/html_downloader.pyi +91 -0
- gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +29 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +50 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
- gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +45 -0
- gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
- gllm_docproc/downloader/html/utils/web_utils.pyi +21 -0
- gllm_docproc/dpo_router/__init__.pyi +3 -0
- gllm_docproc/dpo_router/base_dpo_router.pyi +17 -0
- gllm_docproc/housekeeping/__init__.pyi +3 -0
- gllm_docproc/housekeeping/base_housekeeping.pyi +15 -0
- gllm_docproc/indexer/__init__.pyi +3 -0
- gllm_docproc/indexer/base_indexer.pyi +31 -0
- gllm_docproc/indexer/graph/__init__.pyi +3 -0
- gllm_docproc/indexer/knowledge_graph/__init__.pyi +4 -0
- gllm_docproc/loader/__init__.pyi +4 -0
- gllm_docproc/loader/audio/__init__.pyi +3 -0
- gllm_docproc/loader/base_loader.pyi +31 -0
- gllm_docproc/loader/docx/__init__.pyi +5 -0
- gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
- gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
- gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
- gllm_docproc/loader/exception/__init__.pyi +3 -0
- gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
- gllm_docproc/loader/html/__init__.pyi +5 -0
- gllm_docproc/loader/html/exception/__init__.pyi +3 -0
- gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
- gllm_docproc/loader/html/flat/__init__.pyi +3 -0
- gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +52 -0
- gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
- gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
- gllm_docproc/loader/html/html_base_loader.pyi +25 -0
- gllm_docproc/loader/html/nested/__init__.pyi +3 -0
- gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
- gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
- gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
- gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
- gllm_docproc/loader/html/utils/__init__.pyi +0 -0
- gllm_docproc/loader/html/utils/flat_table_utils.pyi +29 -0
- gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
- gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
- gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
- gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
- gllm_docproc/loader/json/__init__.pyi +3 -0
- gllm_docproc/loader/json/json_elements_loader.pyi +33 -0
- gllm_docproc/loader/loader_utils.pyi +42 -0
- gllm_docproc/loader/pdf/__init__.pyi +13 -0
- gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +40 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +45 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +50 -0
- gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
- gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
- gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pymupdf_loader.pyi +43 -0
- gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +44 -0
- gllm_docproc/loader/pdf/pymupdf_utils.pyi +34 -0
- gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
- gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
- gllm_docproc/loader/pipeline_loader.pyi +48 -0
- gllm_docproc/loader/txt/__init__.pyi +3 -0
- gllm_docproc/loader/txt/txt_loader.pyi +26 -0
- gllm_docproc/loader/xlsx/__init__.pyi +3 -0
- gllm_docproc/loader/xlsx/openpyxl_loader.pyi +37 -0
- gllm_docproc/model/__init__.pyi +4 -0
- gllm_docproc/model/element.pyi +37 -0
- gllm_docproc/model/element_metadata.pyi +35 -0
- gllm_docproc/parser/__init__.pyi +4 -0
- gllm_docproc/parser/base_parser.pyi +29 -0
- gllm_docproc/parser/document/__init__.pyi +6 -0
- gllm_docproc/parser/document/docx_parser.pyi +27 -0
- gllm_docproc/parser/document/pdf_parser.pyi +35 -0
- gllm_docproc/parser/document/txt_parser.pyi +22 -0
- gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
- gllm_docproc/parser/html/__init__.pyi +4 -0
- gllm_docproc/parser/html/flat/__init__.pyi +0 -0
- gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
- gllm_docproc/parser/html/nested/__init__.pyi +0 -0
- gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
- gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
- gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
- gllm_docproc/parser/pipeline_parser.pyi +33 -0
- gllm_docproc/parser/table/__init__.pyi +3 -0
- gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
- gllm_docproc/request_handler/__init__.pyi +3 -0
- gllm_docproc/request_handler/base_request_handler.pyi +17 -0
- gllm_docproc/response_handler/__init__.pyi +3 -0
- gllm_docproc/response_handler/base_response_handler.pyi +39 -0
- gllm_docproc/utils/__init__.pyi +0 -0
- gllm_docproc/utils/file_utils.pyi +76 -0
- gllm_docproc/utils/html_constants.pyi +121 -0
- gllm_docproc.build/.gitignore +1 -0
- gllm_docproc.cpython-312-x86_64-linux-gnu.so +0 -0
- gllm_docproc.pyi +149 -0
- gllm_docproc_binary-0.1.8.dist-info/METADATA +110 -0
- gllm_docproc_binary-0.1.8.dist-info/RECORD +123 -0
- gllm_docproc_binary-0.1.8.dist-info/WHEEL +4 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi

@@ -0,0 +1,57 @@
+import scrapy
+from _typeshed import Incomplete
+from scrapy.crawler import Crawler as Crawler
+from scrapy.http import Response as Response
+from typing import Any
+
+class ScrapeSpider(scrapy.Spider):
+    """A Scrapy spider designed to scrape content from website.
+
+    Attributes:
+        name (str): The name of the spider - 'scrape_spider'.
+        start_urls (list): The list of URLs to start the spider.
+        allowed_domains (list): The list of allowed domains for crawling.
+        extracted_html (str): The HTML content extracted during crawling.
+        custom_settings (dict): Custom settings for the spider, including the log level and log file.
+    """
+    name: str
+    custom_settings: dict[str, Any]
+    start_urls: Incomplete
+    allowed_domains: Incomplete
+    callback: Incomplete
+    removed_components: Incomplete
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Initializes the ScrapeSpider.
+
+        Args:
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+        """
+    def parse(self, response: Response, **kwargs: Any) -> None:
+        """Parses the response obtained from the website, distinguishing between HTML content and other file types.
+
+        Args:
+            response (scrapy.http.Response): The response obtained from the website.
+            **kwargs (dict[str, Any]): Additional keyword arguments.
+        """
+    def get_content_type(self, response: Response) -> str:
+        """Gets the content type from the response headers.
+
+        Args:
+            response (scrapy.http.Response): The response object.
+
+        Returns:
+            str: The content type.
+        """
+    @classmethod
+    def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+        """Creates a new ScrapeSpider instance from the crawler.
+
+        Args:
+            crawler (scrapy.crawler.Crawler): The crawler object.
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+
+        Returns:
+            ScrapeSpider: The ScrapeSpider instance.
+        """
gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi

@@ -0,0 +1,51 @@
+from .scrape_spider import ScrapeSpider as ScrapeSpider
+from _typeshed import Incomplete
+from collections.abc import Generator
+from scrapy.crawler import Crawler as Crawler
+from scrapy.http import Response as Response
+from typing import Any
+
+class ZyteScrapeSpider(ScrapeSpider):
+    """A Scrapy spider designed to scrape content from a website using the Zyte API.
+
+    This spider is specifically tailored for scraping content from a website using the Zyte API to handle block and
+    render Javascript loaded page.
+
+    Attributes:
+        name (str): The name of the spider - 'zyte_scrape_spider'.
+        allowed_domains (list): The list of allowed domains for crawling.
+        start_urls (list): The starting URLs for the spider to initiate crawling
+        custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+    """
+    name: str
+    custom_settings: dict[str, Any]
+    @classmethod
+    def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+        """Create an instance of the spider from a Scrapy crawler.
+
+        This method is a class method that is called by Scrapy to create an instance of the spider
+        based on the provided Scrapy crawler and additional arguments.
+
+        Args:
+            crawler: The Scrapy crawler object.
+            *args: Variable length argument list.
+            **kwargs: Variable length keyword argument list.
+
+        Returns:
+            WebLoaderSpider: An instance of the spider.
+        """
+    def start_requests(self) -> Generator[Incomplete]:
+        """Start Request.
+
+        Initiates requests for the specified start URLs using Scrapy requests with additional
+        meta information for zyte usage.
+
+        This method iterates over the start_urls list and creates a Scrapy Request for each URL.
+        The Request includes meta information to enable the browserHtml feature of the Zyte Automatic Extraction API.
+        """
+    def parse(self, response: Response) -> None:
+        """Parses the HTML response obtained from the website.
+
+        Args:
+            response (scrapy.http.Response): The response object to parse.
+        """
gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi

@@ -0,0 +1,45 @@
+from _typeshed import Incomplete
+from gllm_docproc.downloader.html.exception import ZyteApiKeyNotProvidedException as ZyteApiKeyNotProvidedException
+from gllm_docproc.downloader.html.scraper.scraper.spiders import ScrapeSpider as ScrapeSpider, ZyteScrapeSpider as ZyteScrapeSpider
+from scrapy import Spider as Spider
+from typing import Any
+
+class WebScraperExecutor:
+    '''A utility class for initiating and running web scraping processes using Scrapy spiders.
+
+    This class supports multiple spider types such as PlaywrightScrapeSpider, ZyteScrapeSpider, CrawlBaseSpider,
+    and CrawlSitemapSpider. It utilizes multiprocessing to run the scraping process concurrently.
+
+    Methods:
+        __is_connected_to_internet: Checks if the system is connected to the internet.
+        get_spider_class: Gets the appropriate Scrapy spider class based on the provided spider type.
+        __init__: Initializes the WebScraperExecutor instance.
+        _crawler_results: Appends the provided item to the list of items.
+        _create_crawl_process: Creates and runs a Scrapy crawl process for a specific spider.
+        get_html_strings: Initiates the Scrapy spider and starts the scraping process using multiprocessing.
+
+    Raises:
+        ZyteApiKeyNotProvidedException: If the spider is "zyte" but the Zyte API key is not provided.
+    '''
+    URL_INDEX: int
+    HMLT_: int
+    results: Incomplete
+    items: list[tuple[str, bytes | Exception]]
+    kwargs: Incomplete
+    spider: Incomplete
+    def __init__(self, urls: list[str] | str, **kwargs: Any) -> None:
+        """Initializes the WebScraperExecutor instance.
+
+        Args:
+            urls (List[str] | str): The URLs to be scraped.
+            **kwargs (Dict[str, Any]): Additional keyword arguments.
+        """
+    def get_url_content_pairs(self) -> list[tuple[str, bytes | Exception]]:
+        '''Initiates the Scrapy spider and starts the scraping process using multiprocessing.
+
+        Returns:
+            List: A list of scraped url and html content.
+
+        Raises:
+            ZyteApiKeyNotProvidedException: If the spider is "zyte" but the Zyte API key is not provided.
+        '''
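The stub above suggests the executor is driven by constructing it with one or more URLs and then calling get_url_content_pairs(). A minimal usage sketch, with a placeholder URL and no extra keyword arguments (the accepted kwargs, such as spider selection, are not documented in this stub):

    from gllm_docproc.downloader.html.scraper.web_scraper_executor import WebScraperExecutor

    executor = WebScraperExecutor(["https://example.com"])  # placeholder URL
    for url, content in executor.get_url_content_pairs():
        if isinstance(content, Exception):
            print(f"failed to scrape {url}: {content}")
        else:
            print(url, len(content), "bytes of HTML")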
gllm_docproc/downloader/html/utils/web_utils.pyi

@@ -0,0 +1,21 @@
+def is_valid_url(url: str) -> bool:
+    """Checks if the provided URL is valid.
+
+    Args:
+        url (str): The URL to be validated.
+
+    Returns:
+        bool: True if the URL is valid; False otherwise.
+    """
+def clean_url(url: str) -> str:
+    """Clean the URL so it's suitable for filename.
+
+    Sample input: https://www.bca.co.id/promo-bca
+    Sample output: https___www.bca.co.id_promo-bca
+
+    Args:
+        url (str): The URL to clean
+
+    Returns:
+        str: The cleaned URL
+    """
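Going only by the docstring sample above, the two helpers can be exercised like this (a sketch; the replacement rules of clean_url beyond the documented sample are not specified in the stub):

    from gllm_docproc.downloader.html.utils.web_utils import clean_url, is_valid_url

    url = "https://www.bca.co.id/promo-bca"
    assert is_valid_url(url)
    # Per the docstring sample, the cleaned form is safe to use as a filename.
    assert clean_url(url) == "https___www.bca.co.id_promo-bca"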
gllm_docproc/dpo_router/base_dpo_router.pyi

@@ -0,0 +1,17 @@
+import abc
+from abc import ABC, abstractmethod
+from typing import Any
+
+class BaseDPORouter(ABC, metaclass=abc.ABCMeta):
+    """Base class for routing in document processing."""
+    @abstractmethod
+    def route(self, *args: Any, **kwargs: Any) -> Any:
+        """Routes the input into different processing pipelines based on certain criteria.
+
+        Args:
+            *args (Any): Variable length argument list for routing parameters.
+            **kwargs (Any): Arbitrary keyword arguments for additional routing configuration.
+
+        Returns:
+            Any: The result of the routing process.
+        """
gllm_docproc/housekeeping/base_housekeeping.pyi

@@ -0,0 +1,15 @@
+import abc
+from abc import ABC, abstractmethod
+
+class BaseHouseKeeping(ABC, metaclass=abc.ABCMeta):
+    """Base class for document converter."""
+    @abstractmethod
+    def housekeeping(self, folder_path: str) -> None:
+        """Placeholder method for performing housekeeping tasks on a specified folder.
+
+        Args:
+            folder_path (str): The path to the folder to perform housekeeping on.
+
+        Returns:
+            None
+        """
gllm_docproc/indexer/base_indexer.pyi

@@ -0,0 +1,31 @@
+import abc
+from abc import ABC, abstractmethod
+from typing import Any
+
+class BaseIndexer(ABC, metaclass=abc.ABCMeta):
+    """Base class for document converter."""
+    @abstractmethod
+    def index(self, elements: Any, **kwargs: Any) -> Any:
+        """Index data from a source file into Elasticsearch.
+
+        Args:
+            elements (Any): The information to be indexed. Ideally formatted as List[Dict] and
+                each Dict following the structure of model 'Element'.
+            **kwargs (Any): Additional keyword arguments for customization.
+
+        Returns:
+            Any: The response from the indexing process.
+        """
+    @abstractmethod
+    def delete(self, **kwargs: Any) -> Any:
+        """Delete document from a vector DB.
+
+        The arguments are not defined yet, it depends on the implementation.
+        Some vector database will require: db_url, index_name, document_id.
+
+        Args:
+            **kwargs (Any): Additional keyword arguments for customization.
+
+        Returns:
+            Any: The response from the deletion process.
+        """
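Since BaseIndexer only fixes the index and delete signatures, concrete behavior is left to subclasses. A minimal in-memory sketch (hypothetical, not part of the package):

    from typing import Any

    from gllm_docproc.indexer.base_indexer import BaseIndexer

    class InMemoryIndexer(BaseIndexer):
        """Hypothetical indexer that keeps Element-like dicts in a plain list."""

        def __init__(self) -> None:
            self.store: list[dict] = []

        def index(self, elements: Any, **kwargs: Any) -> int:
            # Ideally `elements` is a List[Dict] following the 'Element' model.
            self.store.extend(elements)
            return len(self.store)

        def delete(self, **kwargs: Any) -> int:
            # Real backends would use kwargs such as an index name or document id;
            # this sketch simply clears everything.
            self.store.clear()
            return 0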
gllm_docproc/loader/base_loader.pyi

@@ -0,0 +1,31 @@
+import abc
+from abc import ABC, abstractmethod
+from typing import Any
+
+class BaseLoader(ABC, metaclass=abc.ABCMeta):
+    """An abstract base class for document loaders.
+
+    This class defines the structure for loading and processing documents to retrieve
+    required values. Subclasses are expected to implement the 'load' method
+    to handle document loading from a given source.
+
+    Methods:
+        load(source, loaded_elements, **kwargs): Abstract method to load a document.
+    """
+    @abstractmethod
+    def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> Any:
+        """Load and process a document.
+
+        This method is abstract and must be implemented in subclasses.
+        It defines the process of loading a document using its source.
+
+        Args:
+            source (str): Might be file path, URL, the content itself.
+            loaded_elements (Any): The loaded elements from previous loaders. ideally formatted as List[Dict].
+            **kwargs (Any): Additional keyword arguments for customization.
+
+        Returns:
+            Any: The loaded document, ideally formatted as List[Dict]. Each dictionary within
+                the list are recommended to follows the structure of model 'Element',
+                to ensure consistency and ease of use across Document Processing Orchestrator.
+        """
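A minimal custom loader sketch, assuming only the abstract signature above (hypothetical, not part of the package):

    from typing import Any

    from gllm_docproc.loader.base_loader import BaseLoader

    class PlainTextLoader(BaseLoader):
        """Hypothetical loader that wraps each line of a text file in a dict."""

        def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> list[dict]:
            elements = list(loaded_elements or [])
            with open(source, encoding="utf-8") as file:
                # Real loaders should shape each dict after the 'Element' model.
                elements.extend({"text": line.rstrip("\n")} for line in file)
            return elements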
gllm_docproc/loader/docx/__init__.pyi

@@ -0,0 +1,5 @@
+from .docx2python_loader import DOCX2PythonLoader as DOCX2PythonLoader
+from .python_docx_loader import PythonDOCXLoader as PythonDOCXLoader
+from .python_docx_table_loader import PythonDOCXTableLoader as PythonDOCXTableLoader
+
+__all__ = ['PythonDOCXLoader', 'PythonDOCXTableLoader', 'DOCX2PythonLoader']
gllm_docproc/loader/docx/docx2python_loader.pyi

@@ -0,0 +1,46 @@
+from _typeshed import Incomplete
+from docx2python.docx_output import DocxContent as DocxContent
+from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
+from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, HEADER as HEADER, IMAGE as IMAGE, TABLE as TABLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+from gllm_docproc.model.element_metadata import DOCX as DOCX, ElementMetadata as ElementMetadata
+from typing import Any
+
+class DOCX2PythonLoader(BaseLoader):
+    """A class for loading and processing DOCX document using docx2python library.
+
+    This class defines the structure for loading and processing DOCX document to retrieve required values
+    (text, table, image, header, footer footnote, endnote). It implements the 'load' method to handle DOCX loading
+    from a given file path.
+
+    DOCX2PythonLoader is used to extract the text, table, image, header, footer, footnote, endnote
+    from the DOCX document.
+
+    Methods:
+        load(source, loaded_elements, **kwargs): Load a DOCX document.
+    """
+    duplicate_merged_cells: Incomplete
+    def __init__(self, duplicate_merged_cells: bool = True) -> None:
+        """Initialize the DOCX2PythonLoader.
+
+        Args:
+            duplicate_merged_cells (bool): A boolean value indicating whether to duplicate merged cells.
+        """
+    def load(self, source: str, loaded_elements: list[dict[str, str]] | None = None, **kwargs: Any) -> list[dict[str, str]]:
+        """Load and process a DOCX document specified by the file path and name (source).
+
+        This method defines the process of loading a DOCX document using its file path.
+        It extracts the text, table, image, header, footer, footnote, endnote from the DOCX document.
+
+        Args:
+            source (str): The file path of the DOCX document.
+            loaded_elements (list[dict[str, str]] | None): A list of loaded elements containing text content
+                and metadata.
+            **kwargs (Any): Additional keyword arguments for loading the DOCX document.
+
+        Kwargs:
+            original_source (str, optional): The original source of the document.
+
+        Returns:
+            list[dict[str, str]]: A list of dictionaries containing loaded content and metadata.
+        """
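A usage sketch grounded only in the constructor and load signatures above (the file path is a placeholder):

    from gllm_docproc.loader.docx import DOCX2PythonLoader

    loader = DOCX2PythonLoader(duplicate_merged_cells=True)
    elements = loader.load("report.docx")  # placeholder path
    print(len(elements), "elements loaded")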
gllm_docproc/loader/docx/python_docx_loader.pyi

@@ -0,0 +1,35 @@
+from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
+from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, HEADER as HEADER, TABLE as TABLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+from gllm_docproc.model.element_metadata import DOCX as DOCX, ElementMetadata as ElementMetadata
+from typing import Any
+
+class PythonDOCXLoader(BaseLoader):
+    """A class for loading and processing DOCX document using PythonDOCXLoader.
+
+    This class defines the structure for loading and processing DOCX document to retrieve required values
+    (Header, Body (Text and Table), Footer). It implements the 'load' method to handle DOCX loading
+    from a given file path.
+
+    PythonDOCXLoader is used to extract the Header, Body (Text and Table), Footer and metadata from the DOCX document.
+
+    Methods:
+        load(source, loaded_elements, **kwargs): Load a DOCX document.
+    """
+    def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+        """Load and process a DOCX document specified by the file path and name (source).
+
+        This method defines the process of loading a DOCX document using its file path.
+        It uses PythonDOCX to extract element text (with text structure) and table from the DOCX document.
+
+        Args:
+            source (str): The path to the DOCX document file.
+            loaded_elements (list[dict[str, Any]]): A list of dictionaries containing loaded content and metadata.
+            **kwargs (Any): Additional keyword arguments for the loader.
+
+        Kwargs:
+            original_source (str, optional): The original source of the document.
+
+        Returns:
+            list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
+        """
gllm_docproc/loader/docx/python_docx_table_loader.pyi

@@ -0,0 +1,35 @@
+from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+from gllm_docproc.loader.docx.python_docx_loader import PythonDOCXLoader as PythonDOCXLoader
+from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+from typing import Any
+
+class PythonDOCXTableLoader(BaseLoader):
+    """Python DOCX Table Loader class to load tables from DOCX document.
+
+    This class is used to load tables from DOCX document using python-docx library.
+    Then it combined the existing loaded elements with the loaded tables.
+
+    Methods:
+        load: Load the tables from the DOCX document and combine it with the existing loaded elements.
+        _filter_table_elements: Filter the table elements from the loaded elements.
+        _get_table_content_count: Get the table content count.
+        _is_table_match: Is the table match with the merged table.
+        _find_matching_merged_table: Find the matching merged table.
+    """
+    def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+        """Load the tables from the DOCX document and combine it with the existing loaded elements.
+
+        This function loads the tables from the DOCX document using python-docx library.
+        Then it combined the existing loaded elements with the loaded tables.
+
+        Args:
+            source (str): The source file path.
+            loaded_elements (list[dict[str, Any]] | None): The existing loaded elements.
+            **kwargs (Any): The keyword arguments.
+
+        Kwargs:
+            original_source (str, optional): The original source of the document.
+
+        Returns:
+            list[dict[str, Any]]: The loaded elements.
+        """
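The docstrings above imply this loader is meant to run after another DOCX loader and merge its table output into the previously loaded elements. A rough chaining sketch (the file path is a placeholder):

    from gllm_docproc.loader.docx import PythonDOCXLoader, PythonDOCXTableLoader

    source = "contract.docx"  # placeholder path
    elements = PythonDOCXLoader().load(source)
    elements = PythonDOCXTableLoader().load(source, loaded_elements=elements)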
gllm_docproc/loader/html/flat/html_flat_base_handler.pyi

@@ -0,0 +1,52 @@
+from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
+from gllm_docproc.loader.html.utils.string_utils import StringUtils as StringUtils
+from gllm_docproc.model.element import Element as Element
+from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
+from parsel import Selector as Selector, SelectorList as SelectorList
+from typing import Callable
+
+def is_base_element(content_selector: Selector | SelectorList[Selector] | None, removed_components: RemovedComponents) -> bool:
+    """Check if the given content selector represents a base element.
+
+    A base element is determined by the type of element in the HTML document.
+    Supported base elements include:
+    1. Unsupported result (if content_selector is None)
+    2. String text
+    3. Removed Components (by class or tag) defined in RemovedComponents
+    4. <input>
+    5. <svg> image
+    6. <img>
+    7. <audio>, <video> (if multiple sources are given, select only the first one)
+    8. <iframe> (cannot get the content of the iframe)
+    9. <embed> (cannot get the content of the embed)
+    10. <br>
+
+    Args:
+        content_selector (Selector | SelectorList[Selector] | None): The selector representing the HTML content.
+        removed_components (RemovedComponents): Components to be removed from processing.
+
+    Returns:
+        bool: True if the content_selector represents a base element; False otherwise.
+    """
+def handle_base_element(content_selector: Selector | SelectorList[Selector] | None, html_head: ElementMetadata, removed_components: RemovedComponents) -> list[Element]:
+    """Handle the base HTML element and generate Element instances.
+
+    Args:
+        content_selector (Selector | SelectorList[Selector] | None): The selector representing the HTML content.
+        html_head (ElementMetadata): The metadata extracted from the HTML head.
+        removed_components (RemovedComponents): Components to be removed from processing.
+
+    Returns:
+        list[Element]: A list of Element instances generated from the HTML content.
+    """
+def get_handler(tag: str) -> Callable[[Selector | SelectorList[Selector], ElementMetadata], list[Element]] | None:
+    """Get the handler function corresponding to the given HTML tag.
+
+    Args:
+        tag (str): The HTML tag for which the handler function is requested.
+
+    Returns:
+        Callable[[Selector | SelectorList[Selector], ElementMetadata], list[Element]] | None: The handler
+            function corresponding to the given HTML tag.
+    """
gllm_docproc/loader/html/flat/html_flat_loader.pyi

@@ -0,0 +1,30 @@
+from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
+from gllm_docproc.loader.html.flat.html_flat_base_handler import handle_base_element as handle_base_element, is_base_element as is_base_element
+from gllm_docproc.loader.html.flat.html_flat_merger import merge_html_elements as merge_html_elements
+from gllm_docproc.loader.html.html_base_loader import HTMLBaseLoader as HTMLBaseLoader
+from gllm_docproc.loader.html.utils.html_utils import extract_html_head as extract_html_head, extract_html_title_tag as extract_html_title_tag
+from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
+from gllm_docproc.model.element import Element as Element
+from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+from parsel import Selector as Selector, SelectorList as SelectorList
+
+class HTMLFlatLoader(HTMLBaseLoader):
+    """A loader class for loading web content and extracting information.
+
+    This class inherits from the BaseLoader class and provides methods to load web content,
+    extract information, and scrape data using Scrapy spiders.
+    """
+    def __init__(self) -> None:
+        """Initialize the HTMLFlatLoader."""
+    @classmethod
+    def extract_html_element(cls, content_selector: SelectorList[Selector] | Selector, html_head: ElementMetadata, removed_components: RemovedComponents) -> list[Element]:
+        """Recursively extract the content of an HTML element.
+
+        Args:
+            content_selector (SelectorList[Selector] | Selector): The content selector.
+            html_head (ElementMetadata): The HTML head metadata.
+            removed_components (RemovedComponents): The removed components.
+
+        Returns:
+            list[Element]: A list of web elements.
+        """
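A rough usage sketch for the flat HTML loader above, with a placeholder URL as the source (the base loader stub later in this diff states that either a URL or a file path is accepted):

    from gllm_docproc.loader.html.flat.html_flat_loader import HTMLFlatLoader

    loader = HTMLFlatLoader()
    elements = loader.load("https://example.com")  # placeholder URL
    for element in elements:
        print(element)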
gllm_docproc/loader/html/flat/html_flat_merger.pyi

@@ -0,0 +1,22 @@
+from gllm_docproc.loader.html.utils.flat_table_utils import FlatTableUtils as FlatTableUtils
+from gllm_docproc.model.element import Element as Element
+from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys
+from parsel import Selector as Selector, SelectorList as SelectorList
+
+def merge_html_elements(content_selector: Selector | SelectorList[Selector], contents: list[Element], html_head: ElementMetadata) -> list[Element]:
+    """For non-base element, add metadata and merge children into one with parent element.
+
+    1. Add its HTML tag into its metadata
+    2. For some HTML tags, combine its children into a single element into the parent, for example:
+        1. Combine <ul> / <ol> children into a single element
+        2. Combine <a> children to become [text](https://link.com)
+
+    Args:
+        content_selector(Selector | SelectorList[Selector]): The content selector representing the HTML element.
+        contents (list[Element]): list of Element instances representing the contents of the HTML element.
+        html_head (ElementMetadata): The metadata extracted from the HTML head.
+
+    Returns:
+        list[Element]: list of Element instances after handling the contents based on the parent tag.
+    """
gllm_docproc/loader/html/html_base_loader.pyi

@@ -0,0 +1,25 @@
+from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+from gllm_docproc.loader.html.utils.html_utils import is_html_content as is_html_content
+from typing import Any
+
+class HTMLBaseLoader(BaseLoader):
+    """A loader class for loading web content and extracting information.
+
+    This class inherits from the BaseLoader class and provides methods to load web content,
+    extract information, and scrape data using Scrapy spiders.
+    """
+    URL_INDEX: int
+    CONTENT_INDEX: int
+    def __init__(self, load_from_html_string: Any) -> None:
+        """Initialize the HTMLBaseLoader."""
+    def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+        """Loads web content and returns the extracted information in JSON format.
+
+        Args:
+            source (str): The source of the web content, either a URL or a file path.
+            loaded_elements (list[dict]): A list of loaded elements to be processed.
+            **kwargs (dict[str, Any]): Additional keyword arguments.
+
+        Returns:
+            list[dict]: The extracted information in JSON format.
+        """