gllm-docproc-binary 0.1.8 (cp312-cp312-macosx_14_0_arm64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of gllm-docproc-binary might be problematic.

Files changed (123)
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +29 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +16 -0
  11. gllm_docproc/data_generator/__init__.pyi +3 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +19 -0
  13. gllm_docproc/downloader/__init__.pyi +3 -0
  14. gllm_docproc/downloader/base_downloader.pyi +16 -0
  15. gllm_docproc/downloader/html/__init__.pyi +4 -0
  16. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  17. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  18. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  19. gllm_docproc/downloader/html/html_downloader.pyi +91 -0
  20. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  21. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  22. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  23. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  24. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +29 -0
  25. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  26. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +50 -0
  27. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  28. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  29. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  30. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +45 -0
  31. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  32. gllm_docproc/downloader/html/utils/web_utils.pyi +21 -0
  33. gllm_docproc/dpo_router/__init__.pyi +3 -0
  34. gllm_docproc/dpo_router/base_dpo_router.pyi +17 -0
  35. gllm_docproc/housekeeping/__init__.pyi +3 -0
  36. gllm_docproc/housekeeping/base_housekeeping.pyi +15 -0
  37. gllm_docproc/indexer/__init__.pyi +3 -0
  38. gllm_docproc/indexer/base_indexer.pyi +31 -0
  39. gllm_docproc/indexer/graph/__init__.pyi +3 -0
  40. gllm_docproc/indexer/knowledge_graph/__init__.pyi +4 -0
  41. gllm_docproc/loader/__init__.pyi +4 -0
  42. gllm_docproc/loader/audio/__init__.pyi +3 -0
  43. gllm_docproc/loader/base_loader.pyi +31 -0
  44. gllm_docproc/loader/docx/__init__.pyi +5 -0
  45. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  46. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  47. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  48. gllm_docproc/loader/exception/__init__.pyi +3 -0
  49. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  50. gllm_docproc/loader/html/__init__.pyi +5 -0
  51. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  52. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  53. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  54. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +52 -0
  55. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  56. gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
  57. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  58. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  59. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  60. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  61. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  62. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  63. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  64. gllm_docproc/loader/html/utils/flat_table_utils.pyi +29 -0
  65. gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
  66. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  67. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  68. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  69. gllm_docproc/loader/json/__init__.pyi +3 -0
  70. gllm_docproc/loader/json/json_elements_loader.pyi +33 -0
  71. gllm_docproc/loader/loader_utils.pyi +42 -0
  72. gllm_docproc/loader/pdf/__init__.pyi +13 -0
  73. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +40 -0
  74. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +45 -0
  75. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +50 -0
  76. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  77. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  78. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  79. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  80. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +38 -0
  81. gllm_docproc/loader/pdf/pymupdf_loader.pyi +43 -0
  82. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +44 -0
  83. gllm_docproc/loader/pdf/pymupdf_utils.pyi +34 -0
  84. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  85. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  86. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  87. gllm_docproc/loader/txt/__init__.pyi +3 -0
  88. gllm_docproc/loader/txt/txt_loader.pyi +26 -0
  89. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  90. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +37 -0
  91. gllm_docproc/model/__init__.pyi +4 -0
  92. gllm_docproc/model/element.pyi +37 -0
  93. gllm_docproc/model/element_metadata.pyi +35 -0
  94. gllm_docproc/parser/__init__.pyi +4 -0
  95. gllm_docproc/parser/base_parser.pyi +29 -0
  96. gllm_docproc/parser/document/__init__.pyi +6 -0
  97. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  98. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  99. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  100. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  101. gllm_docproc/parser/html/__init__.pyi +4 -0
  102. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  103. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  104. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  105. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  106. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  107. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  108. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  109. gllm_docproc/parser/table/__init__.pyi +3 -0
  110. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  111. gllm_docproc/request_handler/__init__.pyi +3 -0
  112. gllm_docproc/request_handler/base_request_handler.pyi +17 -0
  113. gllm_docproc/response_handler/__init__.pyi +3 -0
  114. gllm_docproc/response_handler/base_response_handler.pyi +39 -0
  115. gllm_docproc/utils/__init__.pyi +0 -0
  116. gllm_docproc/utils/file_utils.pyi +76 -0
  117. gllm_docproc/utils/html_constants.pyi +121 -0
  118. gllm_docproc.build/.gitignore +1 -0
  119. gllm_docproc.cpython-312-darwin.so +0 -0
  120. gllm_docproc.pyi +149 -0
  121. gllm_docproc_binary-0.1.8.dist-info/METADATA +110 -0
  122. gllm_docproc_binary-0.1.8.dist-info/RECORD +123 -0
  123. gllm_docproc_binary-0.1.8.dist-info/WHEEL +4 -0
gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi
@@ -0,0 +1,57 @@
+ import scrapy
+ from _typeshed import Incomplete
+ from scrapy.crawler import Crawler as Crawler
+ from scrapy.http import Response as Response
+ from typing import Any
+
+ class ScrapeSpider(scrapy.Spider):
+ """A Scrapy spider designed to scrape content from website.
+
+ Attributes:
+ name (str): The name of the spider - 'scrape_spider'.
+ start_urls (list): The list of URLs to start the spider.
+ allowed_domains (list): The list of allowed domains for crawling.
+ extracted_html (str): The HTML content extracted during crawling.
+ custom_settings (dict): Custom settings for the spider, including the log level and log file.
+ """
+ name: str
+ custom_settings: dict[str, Any]
+ start_urls: Incomplete
+ allowed_domains: Incomplete
+ callback: Incomplete
+ removed_components: Incomplete
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
+ """Initializes the ScrapeSpider.
+
+ Args:
+ *args: Variable length argument list.
+ **kwargs: Arbitrary keyword arguments.
+ """
+ def parse(self, response: Response, **kwargs: Any) -> None:
+ """Parses the response obtained from the website, distinguishing between HTML content and other file types.
+
+ Args:
+ response (scrapy.http.Response): The response obtained from the website.
+ **kwargs (dict[str, Any]): Additional keyword arguments.
+ """
+ def get_content_type(self, response: Response) -> str:
+ """Gets the content type from the response headers.
+
+ Args:
+ response (scrapy.http.Response): The response object.
+
+ Returns:
+ str: The content type.
+ """
+ @classmethod
+ def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+ """Creates a new ScrapeSpider instance from the crawler.
+
+ Args:
+ crawler (scrapy.crawler.Crawler): The crawler object.
+ *args: Variable length argument list.
+ **kwargs: Arbitrary keyword arguments.
+
+ Returns:
+ ScrapeSpider: The ScrapeSpider instance.
+ """
gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi
@@ -0,0 +1,51 @@
+ from .scrape_spider import ScrapeSpider as ScrapeSpider
+ from _typeshed import Incomplete
+ from collections.abc import Generator
+ from scrapy.crawler import Crawler as Crawler
+ from scrapy.http import Response as Response
+ from typing import Any
+
+ class ZyteScrapeSpider(ScrapeSpider):
+ """A Scrapy spider designed to scrape content from a website using the Zyte API.
+
+ This spider is specifically tailored for scraping content from a website using the Zyte API to handle block and
+ render Javascript loaded page.
+
+ Attributes:
+ name (str): The name of the spider - 'zyte_scrape_spider'.
+ allowed_domains (list): The list of allowed domains for crawling.
+ start_urls (list): The starting URLs for the spider to initiate crawling
+ custom_settings (dict): Custom settings for the spider, including item pipelines configuration.
+ """
+ name: str
+ custom_settings: dict[str, Any]
+ @classmethod
+ def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any):
+ """Create an instance of the spider from a Scrapy crawler.
+
+ This method is a class method that is called by Scrapy to create an instance of the spider
+ based on the provided Scrapy crawler and additional arguments.
+
+ Args:
+ crawler: The Scrapy crawler object.
+ *args: Variable length argument list.
+ **kwargs: Variable length keyword argument list.
+
+ Returns:
+ WebLoaderSpider: An instance of the spider.
+ """
+ def start_requests(self) -> Generator[Incomplete]:
+ """Start Request.
+
+ Initiates requests for the specified start URLs using Scrapy requests with additional
+ meta information for zyte usage.
+
+ This method iterates over the start_urls list and creates a Scrapy Request for each URL.
+ The Request includes meta information to enable the browserHtml feature of the Zyte Automatic Extraction API.
+ """
+ def parse(self, response: Response) -> None:
+ """Parses the HTML response obtained from the website.
+
+ Args:
+ response (scrapy.http.Response): The response object to parse.
+ """
gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi
@@ -0,0 +1,45 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.downloader.html.exception import ZyteApiKeyNotProvidedException as ZyteApiKeyNotProvidedException
+ from gllm_docproc.downloader.html.scraper.scraper.spiders import ScrapeSpider as ScrapeSpider, ZyteScrapeSpider as ZyteScrapeSpider
+ from scrapy import Spider as Spider
+ from typing import Any
+
+ class WebScraperExecutor:
+ '''A utility class for initiating and running web scraping processes using Scrapy spiders.
+
+ This class supports multiple spider types such as PlaywrightScrapeSpider, ZyteScrapeSpider, CrawlBaseSpider,
+ and CrawlSitemapSpider. It utilizes multiprocessing to run the scraping process concurrently.
+
+ Methods:
+ __is_connected_to_internet: Checks if the system is connected to the internet.
+ get_spider_class: Gets the appropriate Scrapy spider class based on the provided spider type.
+ __init__: Initializes the WebScraperExecutor instance.
+ _crawler_results: Appends the provided item to the list of items.
+ _create_crawl_process: Creates and runs a Scrapy crawl process for a specific spider.
+ get_html_strings: Initiates the Scrapy spider and starts the scraping process using multiprocessing.
+
+ Raises:
+ ZyteApiKeyNotProvidedException: If the spider is "zyte" but the Zyte API key is not provided.
+ '''
+ URL_INDEX: int
+ HMLT_: int
+ results: Incomplete
+ items: list[tuple[str, bytes | Exception]]
+ kwargs: Incomplete
+ spider: Incomplete
+ def __init__(self, urls: list[str] | str, **kwargs: Any) -> None:
+ """Initializes the WebScraperExecutor instance.
+
+ Args:
+ urls (List[str] | str): The URLs to be scraped.
+ **kwargs (Dict[str, Any]): Additional keyword arguments.
+ """
+ def get_url_content_pairs(self) -> list[tuple[str, bytes | Exception]]:
+ '''Initiates the Scrapy spider and starts the scraping process using multiprocessing.
+
+ Returns:
+ List: A list of scraped url and html content.
+
+ Raises:
+ ZyteApiKeyNotProvidedException: If the spider is "zyte" but the Zyte API key is not provided.
+ '''
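
The stub above only fixes the constructor and `get_url_content_pairs` signatures; spider selection and Zyte credentials travel through `**kwargs`, whose concrete names are not visible here and are therefore left out of this hypothetical usage sketch.

```python
# Hypothetical usage, assuming the runtime package exposes the same names as this stub.
from gllm_docproc.downloader.html.scraper.web_scraper_executor import WebScraperExecutor

executor = WebScraperExecutor(["https://example.com", "https://example.org"])

# The stub's return type pairs each URL with either raw bytes or an Exception.
for url, content in executor.get_url_content_pairs():
    if isinstance(content, Exception):
        print(f"{url}: failed ({content!r})")
    else:
        print(f"{url}: fetched {len(content)} bytes")
```
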
gllm_docproc/downloader/html/utils/__init__.pyi
@@ -0,0 +1,3 @@
+ from .web_utils import clean_url as clean_url, is_valid_url as is_valid_url
+
+ __all__ = ['is_valid_url', 'clean_url']
gllm_docproc/downloader/html/utils/web_utils.pyi
@@ -0,0 +1,21 @@
+ def is_valid_url(url: str) -> bool:
+ """Checks if the provided URL is valid.
+
+ Args:
+ url (str): The URL to be validated.
+
+ Returns:
+ bool: True if the URL is valid; False otherwise.
+ """
+ def clean_url(url: str) -> str:
+ """Clean the URL so it's suitable for filename.
+
+ Sample input: https://www.bca.co.id/promo-bca
+ Sample output: https___www.bca.co.id_promo-bca
+
+ Args:
+ url (str): The URL to clean
+
+ Returns:
+ str: The cleaned URL
+ """
gllm_docproc/dpo_router/__init__.pyi
@@ -0,0 +1,3 @@
+ from .base_dpo_router import BaseDPORouter as BaseDPORouter
+
+ __all__ = ['BaseDPORouter']
gllm_docproc/dpo_router/base_dpo_router.pyi
@@ -0,0 +1,17 @@
+ import abc
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseDPORouter(ABC, metaclass=abc.ABCMeta):
+ """Base class for routing in document processing."""
+ @abstractmethod
+ def route(self, *args: Any, **kwargs: Any) -> Any:
+ """Routes the input into different processing pipelines based on certain criteria.
+
+ Args:
+ *args (Any): Variable length argument list for routing parameters.
+ **kwargs (Any): Arbitrary keyword arguments for additional routing configuration.
+
+ Returns:
+ Any: The result of the routing process.
+ """
+ 
gllm_docproc/housekeeping/__init__.pyi
@@ -0,0 +1,3 @@
+ from .base_housekeeping import BaseHouseKeeping as BaseHouseKeeping
+
+ __all__ = ['BaseHouseKeeping']
gllm_docproc/housekeeping/base_housekeeping.pyi
@@ -0,0 +1,15 @@
+ import abc
+ from abc import ABC, abstractmethod
+
+ class BaseHouseKeeping(ABC, metaclass=abc.ABCMeta):
+ """Base class for document converter."""
+ @abstractmethod
+ def housekeeping(self, folder_path: str) -> None:
+ """Placeholder method for performing housekeeping tasks on a specified folder.
+
+ Args:
+ folder_path (str): The path to the folder to perform housekeeping on.
+
+ Returns:
+ None
+ """
gllm_docproc/indexer/__init__.pyi
@@ -0,0 +1,3 @@
+ from gllm_docproc.indexer.base_indexer import BaseIndexer as BaseIndexer
+
+ __all__ = ['BaseIndexer']
gllm_docproc/indexer/base_indexer.pyi
@@ -0,0 +1,31 @@
+ import abc
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseIndexer(ABC, metaclass=abc.ABCMeta):
+ """Base class for document converter."""
+ @abstractmethod
+ def index(self, elements: Any, **kwargs: Any) -> Any:
+ """Index data from a source file into Elasticsearch.
+
+ Args:
+ elements (Any): The information to be indexed. Ideally formatted as List[Dict] and
+ each Dict following the structure of model 'Element'.
+ **kwargs (Any): Additional keyword arguments for customization.
+
+ Returns:
+ Any: The response from the indexing process.
+ """
+ @abstractmethod
+ def delete(self, **kwargs: Any) -> Any:
+ """Delete document from a vector DB.
+
+ The arguments are not defined yet, it depends on the implementation.
+ Some vector database will require: db_url, index_name, document_id.
+
+ Args:
+ **kwargs (Any): Additional keyword arguments for customization.
+
+ Returns:
+ Any: The response from the deletion process.
+ """
gllm_docproc/indexer/graph/__init__.pyi
@@ -0,0 +1,3 @@
+ from .llama_index_graph_rag_indexer import LlamaIndexGraphRAGIndexer as LlamaIndexGraphRAGIndexer
+
+ __all__ = ['LlamaIndexGraphRAGIndexer']
gllm_docproc/indexer/knowledge_graph/__init__.pyi
@@ -0,0 +1,4 @@
+ from gllm_docproc.indexer.knowledge_graph.base_kg_indexer import BaseKGIndexer as BaseKGIndexer
+ from gllm_docproc.indexer.knowledge_graph.llama_index_kg_indexer import LlamaIndexKGIndexer as LlamaIndexKGIndexer
+
+ __all__ = ['BaseKGIndexer', 'LlamaIndexKGIndexer']
gllm_docproc/loader/__init__.pyi
@@ -0,0 +1,4 @@
+ from .base_loader import BaseLoader as BaseLoader
+ from .pipeline_loader import PipelineLoader as PipelineLoader
+
+ __all__ = ['BaseLoader', 'PipelineLoader']
gllm_docproc/loader/audio/__init__.pyi
@@ -0,0 +1,3 @@
+ from .audio_loader import AudioLoader as AudioLoader
+
+ __all__ = ['AudioLoader']
gllm_docproc/loader/base_loader.pyi
@@ -0,0 +1,31 @@
+ import abc
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseLoader(ABC, metaclass=abc.ABCMeta):
+ """An abstract base class for document loaders.
+
+ This class defines the structure for loading and processing documents to retrieve
+ required values. Subclasses are expected to implement the 'load' method
+ to handle document loading from a given source.
+
+ Methods:
+ load(source, loaded_elements, **kwargs): Abstract method to load a document.
+ """
+ @abstractmethod
+ def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> Any:
+ """Load and process a document.
+
+ This method is abstract and must be implemented in subclasses.
+ It defines the process of loading a document using its source.
+
+ Args:
+ source (str): Might be file path, URL, the content itself.
+ loaded_elements (Any): The loaded elements from previous loaders. ideally formatted as List[Dict].
+ **kwargs (Any): Additional keyword arguments for customization.
+
+ Returns:
+ Any: The loaded document, ideally formatted as List[Dict]. Each dictionary within
+ the list are recommended to follows the structure of model 'Element',
+ to ensure consistency and ease of use across Document Processing Orchestrator.
+ """
gllm_docproc/loader/docx/__init__.pyi
@@ -0,0 +1,5 @@
+ from .docx2python_loader import DOCX2PythonLoader as DOCX2PythonLoader
+ from .python_docx_loader import PythonDOCXLoader as PythonDOCXLoader
+ from .python_docx_table_loader import PythonDOCXTableLoader as PythonDOCXTableLoader
+
+ __all__ = ['PythonDOCXLoader', 'PythonDOCXTableLoader', 'DOCX2PythonLoader']
gllm_docproc/loader/docx/docx2python_loader.pyi
@@ -0,0 +1,46 @@
+ from _typeshed import Incomplete
+ from docx2python.docx_output import DocxContent as DocxContent
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
+ from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, HEADER as HEADER, IMAGE as IMAGE, TABLE as TABLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.model.element_metadata import DOCX as DOCX, ElementMetadata as ElementMetadata
+ from typing import Any
+
+ class DOCX2PythonLoader(BaseLoader):
+ """A class for loading and processing DOCX document using docx2python library.
+
+ This class defines the structure for loading and processing DOCX document to retrieve required values
+ (text, table, image, header, footer footnote, endnote). It implements the 'load' method to handle DOCX loading
+ from a given file path.
+
+ DOCX2PythonLoader is used to extract the text, table, image, header, footer, footnote, endnote
+ from the DOCX document.
+
+ Methods:
+ load(source, loaded_elements, **kwargs): Load a DOCX document.
+ """
+ duplicate_merged_cells: Incomplete
+ def __init__(self, duplicate_merged_cells: bool = True) -> None:
+ """Initialize the DOCX2PythonLoader.
+
+ Args:
+ duplicate_merged_cells (bool): A boolean value indicating whether to duplicate merged cells.
+ """
+ def load(self, source: str, loaded_elements: list[dict[str, str]] | None = None, **kwargs: Any) -> list[dict[str, str]]:
+ """Load and process a DOCX document specified by the file path and name (source).
+
+ This method defines the process of loading a DOCX document using its file path.
+ It extracts the text, table, image, header, footer, footnote, endnote from the DOCX document.
+
+ Args:
+ source (str): The file path of the DOCX document.
+ loaded_elements (list[dict[str, str]] | None): A list of loaded elements containing text content
+ and metadata.
+ **kwargs (Any): Additional keyword arguments for loading the DOCX document.
+
+ Kwargs:
+ original_source (str, optional): The original source of the document.
+
+ Returns:
+ list[dict[str, str]]: A list of dictionaries containing loaded content and metadata.
+ """
gllm_docproc/loader/docx/python_docx_loader.pyi
@@ -0,0 +1,35 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
+ from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, HEADER as HEADER, TABLE as TABLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.model.element_metadata import DOCX as DOCX, ElementMetadata as ElementMetadata
+ from typing import Any
+
+ class PythonDOCXLoader(BaseLoader):
+ """A class for loading and processing DOCX document using PythonDOCXLoader.
+
+ This class defines the structure for loading and processing DOCX document to retrieve required values
+ (Header, Body (Text and Table), Footer). It implements the 'load' method to handle DOCX loading
+ from a given file path.
+
+ PythonDOCXLoader is used to extract the Header, Body (Text and Table), Footer and metadata from the DOCX document.
+
+ Methods:
+ load(source, loaded_elements, **kwargs): Load a DOCX document.
+ """
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+ """Load and process a DOCX document specified by the file path and name (source).
+
+ This method defines the process of loading a DOCX document using its file path.
+ It uses PythonDOCX to extract element text (with text structure) and table from the DOCX document.
+
+ Args:
+ source (str): The path to the DOCX document file.
+ loaded_elements (list[dict[str, Any]]): A list of dictionaries containing loaded content and metadata.
+ **kwargs (Any): Additional keyword arguments for the loader.
+
+ Kwargs:
+ original_source (str, optional): The original source of the document.
+
+ Returns:
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
+ """
gllm_docproc/loader/docx/python_docx_table_loader.pyi
@@ -0,0 +1,35 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.docx.python_docx_loader import PythonDOCXLoader as PythonDOCXLoader
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+ from typing import Any
+
+ class PythonDOCXTableLoader(BaseLoader):
+ """Python DOCX Table Loader class to load tables from DOCX document.
+
+ This class is used to load tables from DOCX document using python-docx library.
+ Then it combined the existing loaded elements with the loaded tables.
+
+ Methods:
+ load: Load the tables from the DOCX document and combine it with the existing loaded elements.
+ _filter_table_elements: Filter the table elements from the loaded elements.
+ _get_table_content_count: Get the table content count.
+ _is_table_match: Is the table match with the merged table.
+ _find_matching_merged_table: Find the matching merged table.
+ """
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+ """Load the tables from the DOCX document and combine it with the existing loaded elements.
+
+ This function loads the tables from the DOCX document using python-docx library.
+ Then it combined the existing loaded elements with the loaded tables.
+
+ Args:
+ source (str): The source file path.
+ loaded_elements (list[dict[str, Any]] | None): The existing loaded elements.
+ **kwargs (Any): The keyword arguments.
+
+ Kwargs:
+ original_source (str, optional): The original source of the document.
+
+ Returns:
+ list[dict[str, Any]]: The loaded elements.
+ """
gllm_docproc/loader/exception/__init__.pyi
@@ -0,0 +1,3 @@
+ from .unsupported_file_extension_error import UnsupportedFileExtensionError as UnsupportedFileExtensionError
+
+ __all__ = ['UnsupportedFileExtensionError']
gllm_docproc/loader/exception/unsupported_file_extension_error.pyi
@@ -0,0 +1,7 @@
+ from _typeshed import Incomplete
+
+ class UnsupportedFileExtensionError(Exception):
+ """An exception for unsupported file extension."""
+ message: Incomplete
+ def __init__(self, ext: str, loader_name: str) -> None:
+ """Initialize the exception."""
gllm_docproc/loader/html/__init__.pyi
@@ -0,0 +1,5 @@
+ from .flat import HTMLFlatLoader as HTMLFlatLoader
+ from .html_base_loader import HTMLBaseLoader as HTMLBaseLoader
+ from .nested import HTMLNestedLoader as HTMLNestedLoader
+
+ __all__ = ['HTMLBaseLoader', 'HTMLFlatLoader', 'HTMLNestedLoader']
gllm_docproc/loader/html/exception/__init__.pyi
@@ -0,0 +1,3 @@
+ from .html_load_exception import HtmlLoadException as HtmlLoadException
+
+ __all__ = ['HtmlLoadException']
gllm_docproc/loader/html/exception/html_load_exception.pyi
@@ -0,0 +1,7 @@
+ from _typeshed import Incomplete
+
+ class HtmlLoadException(Exception):
+ """Custom exception for handling HtmlLoadException errors."""
+ message: Incomplete
+ def __init__(self, message: str) -> None:
+ """Initialize the HtmlLoadException."""
gllm_docproc/loader/html/flat/__init__.pyi
@@ -0,0 +1,3 @@
+ from .html_flat_loader import HTMLFlatLoader as HTMLFlatLoader
+
+ __all__ = ['HTMLFlatLoader']
gllm_docproc/loader/html/flat/html_flat_base_handler.pyi
@@ -0,0 +1,52 @@
+ from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
+ from gllm_docproc.loader.html.utils.string_utils import StringUtils as StringUtils
+ from gllm_docproc.model.element import Element as Element
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+ from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
+ from parsel import Selector as Selector, SelectorList as SelectorList
+ from typing import Callable
+
+ def is_base_element(content_selector: Selector | SelectorList[Selector] | None, removed_components: RemovedComponents) -> bool:
+ """Check if the given content selector represents a base element.
+
+ A base element is determined by the type of element in the HTML document.
+ Supported base elements include:
+ 1. Unsupported result (if content_selector is None)
+ 2. String text
+ 3. Removed Components (by class or tag) defined in RemovedComponents
+ 4. <input>
+ 5. <svg> image
+ 6. <img>
+ 7. <audio>, <video> (if multiple sources are given, select only the first one)
+ 8. <iframe> (cannot get the content of the iframe)
+ 9. <embed> (cannot get the content of the embed)
+ 10. <br>
+
+ Args:
+ content_selector (Selector | SelectorList[Selector] | None): The selector representing the HTML content.
+ removed_components (RemovedComponents): Components to be removed from processing.
+
+ Returns:
+ bool: True if the content_selector represents a base element; False otherwise.
+ """
+ def handle_base_element(content_selector: Selector | SelectorList[Selector] | None, html_head: ElementMetadata, removed_components: RemovedComponents) -> list[Element]:
+ """Handle the base HTML element and generate Element instances.
+
+ Args:
+ content_selector (Selector | SelectorList[Selector] | None): The selector representing the HTML content.
+ html_head (ElementMetadata): The metadata extracted from the HTML head.
+ removed_components (RemovedComponents): Components to be removed from processing.
+
+ Returns:
+ list[Element]: A list of Element instances generated from the HTML content.
+ """
+ def get_handler(tag: str) -> Callable[[Selector | SelectorList[Selector], ElementMetadata], list[Element]] | None:
+ """Get the handler function corresponding to the given HTML tag.
+
+ Args:
+ tag (str): The HTML tag for which the handler function is requested.
+
+ Returns:
+ Callable[[Selector | SelectorList[Selector], ElementMetadata], list[Element]] | None: The handler
+ function corresponding to the given HTML tag.
+ """
gllm_docproc/loader/html/flat/html_flat_loader.pyi
@@ -0,0 +1,30 @@
+ from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
+ from gllm_docproc.loader.html.flat.html_flat_base_handler import handle_base_element as handle_base_element, is_base_element as is_base_element
+ from gllm_docproc.loader.html.flat.html_flat_merger import merge_html_elements as merge_html_elements
+ from gllm_docproc.loader.html.html_base_loader import HTMLBaseLoader as HTMLBaseLoader
+ from gllm_docproc.loader.html.utils.html_utils import extract_html_head as extract_html_head, extract_html_title_tag as extract_html_title_tag
+ from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
+ from gllm_docproc.model.element import Element as Element
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+ from parsel import Selector as Selector, SelectorList as SelectorList
+
+ class HTMLFlatLoader(HTMLBaseLoader):
+ """A loader class for loading web content and extracting information.
+
+ This class inherits from the BaseLoader class and provides methods to load web content,
+ extract information, and scrape data using Scrapy spiders.
+ """
+ def __init__(self) -> None:
+ """Initialize the HTMLFlatLoader."""
+ @classmethod
+ def extract_html_element(cls, content_selector: SelectorList[Selector] | Selector, html_head: ElementMetadata, removed_components: RemovedComponents) -> list[Element]:
+ """Recursively extract the content of an HTML element.
+
+ Args:
+ content_selector (SelectorList[Selector] | Selector): The content selector.
+ html_head (ElementMetadata): The HTML head metadata.
+ removed_components (RemovedComponents): The removed components.
+
+ Returns:
+ list[Element]: A list of web elements.
+ """
gllm_docproc/loader/html/flat/html_flat_merger.pyi
@@ -0,0 +1,22 @@
+ from gllm_docproc.loader.html.utils.flat_table_utils import FlatTableUtils as FlatTableUtils
+ from gllm_docproc.model.element import Element as Element
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+ from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys
+ from parsel import Selector as Selector, SelectorList as SelectorList
+
+ def merge_html_elements(content_selector: Selector | SelectorList[Selector], contents: list[Element], html_head: ElementMetadata) -> list[Element]:
+ """For non-base element, add metadata and merge children into one with parent element.
+
+ 1. Add its HTML tag into its metadata
+ 2. For some HTML tags, combine its children into a single element into the parent, for example:
+ 1. Combine <ul> / <ol> children into a single element
+ 2. Combine <a> children to become [text](https://link.com)
+
+ Args:
+ content_selector(Selector | SelectorList[Selector]): The content selector representing the HTML element.
+ contents (list[Element]): list of Element instances representing the contents of the HTML element.
+ html_head (ElementMetadata): The metadata extracted from the HTML head.
+
+ Returns:
+ list[Element]: list of Element instances after handling the contents based on the parent tag.
+ """
gllm_docproc/loader/html/html_base_loader.pyi
@@ -0,0 +1,25 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.html.utils.html_utils import is_html_content as is_html_content
+ from typing import Any
+
+ class HTMLBaseLoader(BaseLoader):
+ """A loader class for loading web content and extracting information.
+
+ This class inherits from the BaseLoader class and provides methods to load web content,
+ extract information, and scrape data using Scrapy spiders.
+ """
+ URL_INDEX: int
+ CONTENT_INDEX: int
+ def __init__(self, load_from_html_string: Any) -> None:
+ """Initialize the HTMLBaseLoader."""
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+ """Loads web content and returns the extracted information in JSON format.
+
+ Args:
+ source (str): The source of the web content, either a URL or a file path.
+ loaded_elements (list[dict]): A list of loaded elements to be processed.
+ **kwargs (dict[str, Any]): Additional keyword arguments.
+
+ Returns:
+ list[dict]: The extracted information in JSON format.
+ """
gllm_docproc/loader/html/nested/__init__.pyi
@@ -0,0 +1,3 @@
+ from .html_nested_loader import HTMLNestedLoader as HTMLNestedLoader
+
+ __all__ = ['HTMLNestedLoader']