gllm-docproc-binary 0.7.22__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gllm-docproc-binary might be problematic. Click here for more details.

Files changed (167) hide show
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +28 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +15 -0
  11. gllm_docproc/data_generator/__init__.pyi +5 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +18 -0
  13. gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
  14. gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
  15. gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
  16. gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
  17. gllm_docproc/downloader/__init__.pyi +5 -0
  18. gllm_docproc/downloader/base_downloader.pyi +19 -0
  19. gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
  20. gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
  21. gllm_docproc/downloader/html/__init__.pyi +6 -0
  22. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  23. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  24. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  25. gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
  26. gllm_docproc/downloader/html/html_downloader.pyi +114 -0
  27. gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
  28. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  29. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  30. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  31. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  32. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
  33. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  34. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
  35. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  36. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  37. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  38. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
  39. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  40. gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
  41. gllm_docproc/dpo_router/__init__.pyi +5 -0
  42. gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
  43. gllm_docproc/dpo_router/loader_router.pyi +52 -0
  44. gllm_docproc/dpo_router/parser_router.pyi +42 -0
  45. gllm_docproc/housekeeping/__init__.pyi +3 -0
  46. gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
  47. gllm_docproc/indexer/__init__.pyi +3 -0
  48. gllm_docproc/indexer/base_indexer.pyi +30 -0
  49. gllm_docproc/indexer/graph/__init__.pyi +4 -0
  50. gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
  51. gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
  52. gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
  53. gllm_docproc/indexer/vector/__init__.pyi +3 -0
  54. gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
  55. gllm_docproc/loader/__init__.pyi +4 -0
  56. gllm_docproc/loader/audio/__init__.pyi +3 -0
  57. gllm_docproc/loader/audio/audio_loader.pyi +45 -0
  58. gllm_docproc/loader/base_loader.pyi +30 -0
  59. gllm_docproc/loader/csv/__init__.pyi +3 -0
  60. gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
  61. gllm_docproc/loader/docx/__init__.pyi +5 -0
  62. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  63. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  64. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  65. gllm_docproc/loader/exception/__init__.pyi +4 -0
  66. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  67. gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
  68. gllm_docproc/loader/html/__init__.pyi +5 -0
  69. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  70. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  71. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  72. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +65 -0
  73. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  74. gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
  75. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  76. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  77. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  78. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  79. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  80. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  81. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  82. gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
  83. gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
  84. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  85. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  86. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  87. gllm_docproc/loader/image/__init__.pyi +3 -0
  88. gllm_docproc/loader/image/image_loader.pyi +54 -0
  89. gllm_docproc/loader/json/__init__.pyi +3 -0
  90. gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
  91. gllm_docproc/loader/loader_utils.pyi +43 -0
  92. gllm_docproc/loader/pdf/__init__.pyi +14 -0
  93. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
  94. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
  95. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
  96. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  97. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  98. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  99. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  100. gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
  101. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
  102. gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
  103. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
  104. gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
  105. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  106. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  107. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  108. gllm_docproc/loader/pptx/__init__.pyi +3 -0
  109. gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
  110. gllm_docproc/loader/txt/__init__.pyi +3 -0
  111. gllm_docproc/loader/txt/txt_loader.pyi +55 -0
  112. gllm_docproc/loader/video/__init__.pyi +3 -0
  113. gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
  114. gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
  115. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  116. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
  117. gllm_docproc/model/__init__.pyi +7 -0
  118. gllm_docproc/model/element.pyi +38 -0
  119. gllm_docproc/model/element_metadata.pyi +35 -0
  120. gllm_docproc/model/loader_type.pyi +20 -0
  121. gllm_docproc/model/media.pyi +51 -0
  122. gllm_docproc/model/parser_type.pyi +19 -0
  123. gllm_docproc/parser/__init__.pyi +4 -0
  124. gllm_docproc/parser/base_parser.pyi +28 -0
  125. gllm_docproc/parser/document/__init__.pyi +7 -0
  126. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  127. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  128. gllm_docproc/parser/document/pptx_parser.pyi +34 -0
  129. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  130. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  131. gllm_docproc/parser/html/__init__.pyi +4 -0
  132. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  133. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  134. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  135. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  136. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  137. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  138. gllm_docproc/parser/image/__init__.pyi +4 -0
  139. gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
  140. gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
  141. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  142. gllm_docproc/parser/table/__init__.pyi +3 -0
  143. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  144. gllm_docproc/request_handler/__init__.pyi +3 -0
  145. gllm_docproc/request_handler/base_request_handler.pyi +16 -0
  146. gllm_docproc/response_handler/__init__.pyi +3 -0
  147. gllm_docproc/response_handler/base_response_handler.pyi +38 -0
  148. gllm_docproc/utils/__init__.pyi +3 -0
  149. gllm_docproc/utils/async_utils.pyi +22 -0
  150. gllm_docproc/utils/file_utils.pyi +76 -0
  151. gllm_docproc/utils/html_constants.pyi +122 -0
  152. gllm_docproc/validator/__init__.pyi +6 -0
  153. gllm_docproc/validator/base_validator.pyi +34 -0
  154. gllm_docproc/validator/character_count_validator.pyi +26 -0
  155. gllm_docproc/validator/file_size_validator.pyi +20 -0
  156. gllm_docproc/validator/model/__init__.pyi +4 -0
  157. gllm_docproc/validator/model/validator_input.pyi +50 -0
  158. gllm_docproc/validator/model/validator_result.pyi +19 -0
  159. gllm_docproc/validator/page_count_validator.pyi +23 -0
  160. gllm_docproc/validator/pipeline_validator.pyi +40 -0
  161. gllm_docproc.build/.gitignore +1 -0
  162. gllm_docproc.cp311-win_amd64.pyd +0 -0
  163. gllm_docproc.pyi +220 -0
  164. gllm_docproc_binary-0.7.22.dist-info/METADATA +216 -0
  165. gllm_docproc_binary-0.7.22.dist-info/RECORD +167 -0
  166. gllm_docproc_binary-0.7.22.dist-info/WHEEL +5 -0
  167. gllm_docproc_binary-0.7.22.dist-info/top_level.txt +1 -0
@@ -0,0 +1,12 @@
1
+ from _typeshed import Incomplete
2
+
3
+ class VideoConversionError(Exception):
4
+ """An exception for video conversion failures."""
5
+ message: Incomplete
6
+ def __init__(self, video_path: str, cause: str) -> None:
7
+ """Initialize the exception.
8
+
9
+ Args:
10
+ video_path (str): Path to the video file that failed to convert.
11
+ cause (str): Description of the underlying cause of the failure.
12
+ """
@@ -0,0 +1,5 @@
1
+ from .flat import HTMLFlatLoader as HTMLFlatLoader
2
+ from .html_base_loader import HTMLBaseLoader as HTMLBaseLoader
3
+ from .nested import HTMLNestedLoader as HTMLNestedLoader
4
+
5
+ __all__ = ['HTMLBaseLoader', 'HTMLFlatLoader', 'HTMLNestedLoader']
@@ -0,0 +1,3 @@
1
+ from .html_load_exception import HtmlLoadException as HtmlLoadException
2
+
3
+ __all__ = ['HtmlLoadException']
@@ -0,0 +1,7 @@
1
+ from _typeshed import Incomplete
2
+
3
+ class HtmlLoadException(Exception):
4
+ """Custom exception for handling HtmlLoadException errors."""
5
+ message: Incomplete
6
+ def __init__(self, message: str) -> None:
7
+ """Initialize the HtmlLoadException."""
@@ -0,0 +1,3 @@
1
+ from .html_flat_loader import HTMLFlatLoader as HTMLFlatLoader
2
+
3
+ __all__ = ['HTMLFlatLoader']
@@ -0,0 +1,65 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
3
+ from gllm_docproc.loader.html.utils.string_utils import StringUtils as StringUtils
4
+ from gllm_docproc.model.element import Element as Element
5
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
6
+ from gllm_docproc.model.media import Media as Media, MediaSourceType as MediaSourceType, MediaType as MediaType
7
+ from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
8
+ from parsel import Selector, SelectorList
9
+ from typing import Callable
10
+
11
+ def setup_mingw_path_for_cairosvg() -> None:
12
+ """On Windows, add mingw bin to PATH so CairoSVG can find required DLLs.
13
+
14
+ CairoSVG relies on DLLs (e.g., for font rendering and image processing) that may not be available
15
+ in the default PATH. Adding the mingw bin directory ensures these dependencies are found,
16
+ preventing import or runtime errors.
17
+ """
18
+
19
+ CAIROSVG_AVAILABLE: bool
20
+ logger: Incomplete
21
+
22
+ def is_base_element(content_selector: Selector | SelectorList[Selector] | None, removed_components: RemovedComponents) -> bool:
23
+ """Check if the given content selector represents a base element.
24
+
25
+ A base element is determined by the type of element in the HTML document.
26
+ Supported base elements include:
27
+ 1. Unsupported result (if content_selector is None)
28
+ 2. String text
29
+ 3. Removed Components (by class or tag) defined in RemovedComponents
30
+ 4. <input>
31
+ 5. <svg> image
32
+ 6. <img>
33
+ 7. <audio>, <video> (if multiple sources are given, select only the first one)
34
+ 8. <iframe> (cannot get the content of the iframe)
35
+ 9. <embed> (cannot get the content of the embed)
36
+ 10. <br>
37
+
38
+ Args:
39
+ content_selector (Selector | SelectorList[Selector] | None): The selector representing the HTML content.
40
+ removed_components (RemovedComponents): Components to be removed from processing.
41
+
42
+ Returns:
43
+ bool: True if the content_selector represents a base element; False otherwise.
44
+ """
45
+ def handle_base_element(content_selector: Selector | SelectorList[Selector] | None, html_head: ElementMetadata, removed_components: RemovedComponents) -> list[Element]:
46
+ """Handle the base HTML element and generate Element instances.
47
+
48
+ Args:
49
+ content_selector (Selector | SelectorList[Selector] | None): The selector representing the HTML content.
50
+ html_head (ElementMetadata): The metadata extracted from the HTML head.
51
+ removed_components (RemovedComponents): Components to be removed from processing.
52
+
53
+ Returns:
54
+ list[Element]: A list of Element instances generated from the HTML content.
55
+ """
56
+ def get_handler(tag: str) -> Callable[[Selector | SelectorList[Selector], ElementMetadata], list[Element]] | None:
57
+ """Get the handler function corresponding to the given HTML tag.
58
+
59
+ Args:
60
+ tag (str): The HTML tag for which the handler function is requested.
61
+
62
+ Returns:
63
+ Callable[[Selector | SelectorList[Selector], ElementMetadata], list[Element]] | None: The handler
64
+ function corresponding to the given HTML tag.
65
+ """
@@ -0,0 +1,30 @@
1
+ from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
2
+ from gllm_docproc.loader.html.flat.html_flat_base_handler import handle_base_element as handle_base_element, is_base_element as is_base_element
3
+ from gllm_docproc.loader.html.flat.html_flat_merger import merge_html_elements as merge_html_elements
4
+ from gllm_docproc.loader.html.html_base_loader import HTMLBaseLoader as HTMLBaseLoader
5
+ from gllm_docproc.loader.html.utils.html_utils import extract_html_head as extract_html_head, extract_html_title_tag as extract_html_title_tag
6
+ from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
7
+ from gllm_docproc.model.element import Element as Element
8
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
9
+ from parsel import Selector, SelectorList
10
+
11
+ class HTMLFlatLoader(HTMLBaseLoader):
12
+ """A loader class for loading web content and extracting information.
13
+
14
+ This class inherits from the BaseLoader class and provides methods to load web content,
15
+ extract information, and scrape data using Scrapy spiders.
16
+ """
17
+ def __init__(self) -> None:
18
+ """Initialize the HTMLFlatLoader."""
19
+ @classmethod
20
+ def extract_html_element(cls, content_selector: SelectorList[Selector] | Selector, html_head: ElementMetadata, removed_components: RemovedComponents) -> list[Element]:
21
+ """Recursively extract the content of an HTML element.
22
+
23
+ Args:
24
+ content_selector (SelectorList[Selector] | Selector): The content selector.
25
+ html_head (ElementMetadata): The HTML head metadata.
26
+ removed_components (RemovedComponents): The removed components.
27
+
28
+ Returns:
29
+ list[Element]: A list of web elements.
30
+ """
@@ -0,0 +1,22 @@
1
+ from gllm_docproc.loader.html.utils.flat_table_utils import FlatTableUtils as FlatTableUtils
2
+ from gllm_docproc.model.element import Element as Element
3
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
4
+ from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys
5
+ from parsel import Selector, SelectorList
6
+
7
+ def merge_html_elements(content_selector: Selector | SelectorList[Selector], contents: list[Element], html_head: ElementMetadata) -> list[Element]:
8
+ """For non-base element, add metadata and merge children into one with parent element.
9
+
10
+ 1. Add its HTML tag into its metadata
11
+ 2. For some HTML tags, combine its children into a single element into the parent, for example:
12
+ 1. Combine <ul> / <ol> children into a single element
13
+ 2. Combine <a> children to become [text](https://link.com)
14
+
15
+ Args:
16
+ content_selector(Selector | SelectorList[Selector]): The content selector representing the HTML element.
17
+ contents (list[Element]): list of Element instances representing the contents of the HTML element.
18
+ html_head (ElementMetadata): The metadata extracted from the HTML head.
19
+
20
+ Returns:
21
+ list[Element]: list of Element instances after handling the contents based on the parent tag.
22
+ """
@@ -0,0 +1,25 @@
1
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
2
+ from gllm_docproc.loader.html.utils.html_utils import is_html_content as is_html_content
3
+ from typing import Any
4
+
5
+ class HTMLBaseLoader(BaseLoader):
6
+ """A loader class for loading web content and extracting information.
7
+
8
+ This class inherits from the BaseLoader class and provides methods to load web content,
9
+ extract information, and scrape data using Scrapy spiders.
10
+ """
11
+ URL_INDEX: int
12
+ CONTENT_INDEX: int
13
+ def __init__(self, load_from_html_string: Any) -> None:
14
+ """Initialize the HTMLBaseLoader."""
15
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
16
+ """Loads web content and returns the extracted information in JSON format.
17
+
18
+ Args:
19
+ source (str): The source of the web content, either a URL or a file path.
20
+ loaded_elements (list[dict]): A list of loaded elements to be processed.
21
+ **kwargs (dict[str, Any]): Additional keyword arguments.
22
+
23
+ Returns:
24
+ list[dict]: The extracted information in JSON format.
25
+ """
@@ -0,0 +1,3 @@
1
+ from .html_nested_loader import HTMLNestedLoader as HTMLNestedLoader
2
+
3
+ __all__ = ['HTMLNestedLoader']
@@ -0,0 +1,40 @@
1
+ class DictionaryUtils:
2
+ """A utility class providing methods to manipulate dictionaries."""
3
+ @staticmethod
4
+ def add_or_skip_value(dictionary, key, value):
5
+ """Adds a value to a dictionary if the value is not None for a given key.
6
+
7
+ Args:
8
+ dictionary (dict): The dictionary to be modified.
9
+ key (hashable): The key where the value needs to be added.
10
+ value (any): The value to be added.
11
+
12
+ Returns:
13
+ dict: The modified dictionary.
14
+ """
15
+ @staticmethod
16
+ def append_value(dictionary, key, value):
17
+ """Appends a value to a list under a specific key in a dictionary.
18
+
19
+ If the key already exists in the dictionary, the value is appended to the list under that key.
20
+ If the key does not exist, a new list is created with the value as its first element.
21
+
22
+ Args:
23
+ dictionary (dict): The dictionary to be modified.
24
+ key (hashable): The key under which the value needs to be added.
25
+ value (any): The value to be appended to the list under the key.
26
+
27
+ Returns:
28
+ dict: The modified dictionary with the value appended to the list under the key.
29
+ """
30
+ @staticmethod
31
+ def put_key_to_bottom(dictionary, key):
32
+ """Rearange key in dictionary.
33
+
34
+ Args:
35
+ dictionary (dict): The dictionary to be modified.
36
+ key (hashable): The key.
37
+
38
+ Returns:
39
+ dict: The modified dictionary.
40
+ """
@@ -0,0 +1,128 @@
1
+ from .html_nested_element_handler import get_element as get_element
2
+ from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
3
+ from gllm_docproc.loader.html.nested.dictionary_utils import DictionaryUtils as DictionaryUtils
4
+ from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
5
+ from gllm_docproc.loader.html.utils.string_utils import StringUtils as StringUtils
6
+ from gllm_docproc.loader.html.utils.table_utils import TableUtils as TableUtils
7
+ from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
8
+
9
+ def get_element_content(content_selector, removed_components: RemovedComponents) -> list[dict]:
10
+ '''Traverses each element to get the content.
11
+
12
+ This function extract html body recursively
13
+
14
+ Input example:
15
+
16
+ .. code-block:: html
17
+
18
+ <html>
19
+ <head>
20
+ <title>Title</title>
21
+ </head>
22
+ <body>
23
+ <div class="container">
24
+ <h1>Welcome to My Website</h1>
25
+ <div>
26
+ Hello World
27
+ </div>
28
+ </div>
29
+ <p>This is another paragraph.</p>
30
+ </body>
31
+ </html>
32
+
33
+ Output:
34
+
35
+ .. code-block:: groovy
36
+
37
+ [
38
+ {
39
+ \'tag\': \'body\',
40
+ \'class\': None,
41
+ \'content\': [
42
+ {
43
+ \'tag\': \'div\',
44
+ \'class\': \'container\',
45
+ \'content\': [
46
+ {
47
+ \'tag\': \'h1\',
48
+ \'class\': None, \'content\': [
49
+ {
50
+ \'tag\': \'text\',
51
+ \'content\': \'Welcome to My Website\'
52
+ }
53
+ ]
54
+ },
55
+ {
56
+ \'tag\': \'div\',
57
+ \'class\': None,
58
+ \'content\': [
59
+ {\'tag\': \'text\', \'content\': \'Hello World\'}
60
+ ]
61
+ }
62
+ ]
63
+ },
64
+ {
65
+ \'tag\': \'p\',
66
+ \'class\': None,
67
+ \'content\': [
68
+ {\'tag\': \'text\', \'content\': \'This is another paragraph.\'}
69
+ ]
70
+ }
71
+ ]
72
+ }
73
+ ]
74
+
75
+ Args:
76
+ content_selector: The content to be traversed.
77
+ removed_components: Removed class or tags.
78
+
79
+ Returns:
80
+ The List of extracted contents.
81
+ '''
82
+ def is_base_element(content_selector, removed_components: RemovedComponents) -> bool:
83
+ """Check if the given content selector represents a base element.
84
+
85
+ See html_flat_base_handler.py for more information.
86
+
87
+ Args:
88
+ content_selector: The content selector to check.
89
+ removed_components (RemovedComponents): An instance of RemovedComponents class.
90
+
91
+ Returns:
92
+ bool: True if the content_selector represents a base element; False otherwise.
93
+ """
94
+ def handle_base_element(content_selector, removed_components: RemovedComponents) -> list[dict]:
95
+ """Handle the processing of a base HTML element.
96
+
97
+ Args:
98
+ content_selector: The content selector representing the HTML element.
99
+ removed_components (RemovedComponents): An object containing information about components to be removed.
100
+
101
+ Returns:
102
+ List of dict : A List of dictionaries containing information about the HTML element, or None
103
+ if the element should be skipped.
104
+ - tag: HTML tag name.
105
+ - class: CSS class of the element (if available).
106
+ Additional keys may be added based on the specific element handler.
107
+ """
108
+ def get_handler(tag: str):
109
+ """Get the element information from the specified content selector.
110
+
111
+ Args:
112
+ tag (str): The HTML tag to get the handler for.
113
+
114
+ Returns:
115
+ dict: A dictionary containing information about the HTML element.
116
+ - tag: HTML tag name.
117
+ - class: CSS class of the element (if available).
118
+ Additional keys may be added based on the specific element handler
119
+ """
120
+ def create_text_dict(message):
121
+ """Creates a dictionary with 'text' tag and specified content.
122
+
123
+ Args:
124
+ message: The content to be added to the dictionary.
125
+
126
+ Returns:
127
+ A dictionary with 'text' tag and specified content.
128
+ """
@@ -0,0 +1,24 @@
1
+ from gllm_docproc.loader.html.nested.dictionary_utils import DictionaryUtils as DictionaryUtils
2
+ from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
3
+
4
+ def get_element(content_selector) -> dict:
5
+ """Get the element information from the specified content selector.
6
+
7
+ Args:
8
+ content_selector: The content selector representing the HTML element.
9
+
10
+ Returns:
11
+ dict: A dictionary containing information about the HTML element.
12
+ - tag: HTML tag name.
13
+ - class: CSS class of the element (if available).
14
+ Additional keys may be added based on the specific element handler
15
+ """
16
+ def get_handler(tag: str):
17
+ """Gets the handler for the specified HTML tag.
18
+
19
+ Args:
20
+ tag (str): The HTML tag to get the handler for.
21
+
22
+ Returns:
23
+ Callable: The handler function for the specified HTML tag.
24
+ """
@@ -0,0 +1,15 @@
1
+ from .html_nested_base_handler import get_element_content as get_element_content
2
+ from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
3
+ from gllm_docproc.loader.html.html_base_loader import HTMLBaseLoader as HTMLBaseLoader
4
+ from gllm_docproc.loader.html.utils.html_utils import extract_html_head as extract_html_head
5
+ from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
6
+ from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, MetaDataKeys as MetaDataKeys
7
+
8
+ class HTMLNestedLoader(HTMLBaseLoader):
9
+ """A loader class for loading web content and extracting information.
10
+
11
+ This class inherits from the BaseLoader class and provides methods to load web content,
12
+ extract information, and scrape data using Scrapy spiders.
13
+ """
14
+ def __init__(self) -> None:
15
+ """Initialize the HTMLNestedLoader."""
File without changes
@@ -0,0 +1,44 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.model.element import Element as Element
3
+ from gllm_docproc.utils.html_constants import HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys, TableConstants as TableConstants
4
+
5
+ class FlatTableUtils:
6
+ """A utility class providing methods for extracting data from HTML tables."""
7
+ colcount: int
8
+ rowspans: Incomplete
9
+ def __init__(self) -> None:
10
+ """Initialize the FlatTableUtils."""
11
+ def generate_tables(self, content: list[Element]) -> list[list[str]]:
12
+ """Generate tables from HTML content.
13
+
14
+ Args:
15
+ content (List[Element]): The list of Element instances representing the HTML content.
16
+
17
+ Returns:
18
+ List[List[str]]: A list containing the generated tables.
19
+ """
20
+ def filter_table(self, table_content: list[Element]) -> tuple[list[Element], list[Element]]:
21
+ """Filter the HTML table content.
22
+
23
+ Args:
24
+ table_content (List[Element]): The list of Element instances representing the HTML table.
25
+
26
+ Returns:
27
+ tuple[List[Element], List[Element]]: A tuple containing the filtered table content
28
+ and the removed elements.
29
+ """
30
+ def find_and_update_table_media(self, table_element: Element, elements: list[Element]) -> tuple[Element, list[Element]]:
31
+ """Find images in the table and return updated table element and found images.
32
+
33
+ Instead of modifying the input table_element directly, this function creates a copy and returns
34
+ both the updated table element and the list of images found in the table.
35
+
36
+ Args:
37
+ table_element (Element): The table element.
38
+ elements (list[Element]): The list of elements.
39
+
40
+ Returns:
41
+ tuple[Element, list[Element]]: A tuple containing:
42
+ - Updated copy of the table element with media metadata
43
+ - List of images found in the table
44
+ """
@@ -0,0 +1,41 @@
1
+ from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
2
+ from gllm_docproc.model.element import Element as Element
3
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, HTML as HTML
4
+ from gllm_docproc.utils.html_constants import HTMLTags as HTMLTags, MetaDataKeys as MetaDataKeys
5
+ from scrapy.http import HtmlResponse
6
+ from typing import Any
7
+
8
+ def is_html_content(content: str) -> bool:
9
+ '''Check if the provided content appears to be HTML.
10
+
11
+ This function performs a case-insensitive check to determine if the content contains HTML tags,
12
+ specifically by searching for the opening and closing HTML tags ("<html" and "</html>").
13
+
14
+ Args:
15
+ content (str): The content to check.
16
+
17
+ Returns:
18
+ bool: True if the content is identified as HTML; False otherwise.
19
+ '''
20
+ def extract_html_head(response: HtmlResponse, element_metadata: dict[str, Any] | None) -> ElementMetadata:
21
+ """Extracts metadata from an HTML response.
22
+
23
+ Args:
24
+ response (HtmlResponse): The HTML response.
25
+ element_metadata (dict[str, Any] | None): The element metadata.
26
+
27
+ Returns:
28
+ ElementMetadata: A class containing element metadata.
29
+
30
+ Raises:
31
+ HtmlLoadException: If an error occurs during the extraction process.
32
+ """
33
+ def extract_html_title_tag(metadata: ElementMetadata) -> list[Element]:
34
+ """Gets the title element as a Element.
35
+
36
+ Args:
37
+ metadata (ElementMetadata): A class containing element metadata.
38
+
39
+ Returns:
40
+ List[Element]: List containing a single Element instance representing the title element.
41
+ """
@@ -0,0 +1,53 @@
1
+ class RemovedComponents:
2
+ """Class representing removed components from a document.
3
+
4
+ This class defines three methods for retrieving partial class, full class, and HTML tags
5
+ associated with removed components.
6
+ """
7
+ def get_partial_class(self) -> list[str]:
8
+ """Get partial class.
9
+
10
+ Method to get the partial class of the removed component. Partial class consists of
11
+ classes that will be filtered.
12
+
13
+ Returns:
14
+ str: The partial class name associated with the removed component.
15
+ """
16
+ def get_full_class(self) -> list[str]:
17
+ """Get full class.
18
+
19
+ Method to get the full class of the removed component. Full class consists of
20
+ exact match of classes that will be filtered.
21
+
22
+ Returns:
23
+ str: The full class name associated with the removed component.
24
+ """
25
+ def get_html_tags(self) -> list[str]:
26
+ """Method to get the HTML tags associated with the removed component.
27
+
28
+ Returns:
29
+ list: A list of HTML tags associated with the removed component.
30
+ """
31
+ @staticmethod
32
+ def is_removed_component(tag: str | None, class_: str | None, removed_components: RemovedComponents | None) -> bool:
33
+ """Checks if a component should be removed based on its tag and class.
34
+
35
+ Args:
36
+ tag (str): The tag of the component.
37
+ class_ (str): The class of the component.
38
+ removed_components (RemovedComponents): The components to be removed, including HTML tags and classes.
39
+
40
+ Returns:
41
+ True if the component should be removed, False otherwise.
42
+ """
43
+ @staticmethod
44
+ def check_list_in_substring(message: str, check_list: list[str]) -> bool:
45
+ """Checks if any substring from the check_list exists in the message string.
46
+
47
+ Args:
48
+ message (str): The string to search for substrings.
49
+ check_list (list): A list of substrings to be checked.
50
+
51
+ Returns:
52
+ - bool: True if any substring from check_list is found in message, otherwise False.
53
+ """
@@ -0,0 +1,33 @@
1
+ class StringUtils:
2
+ """A utility class providing methods for text cleaning."""
3
+ @staticmethod
4
+ def clean_text(text: str | None) -> str:
5
+ """Clean the input text by removing extra whitespace, newlines, and tabs.
6
+
7
+ Args:
8
+ text (str): The text to be cleaned.
9
+
10
+ Returns:
11
+ str: The cleaned text.
12
+ """
13
+ @staticmethod
14
+ def remove_extension(file_name: str) -> str:
15
+ """Removes the file extension from a given file name.
16
+
17
+ Args:
18
+ file_name (str): The name of the file from which the extension will be removed.
19
+
20
+ Returns:
21
+ str: File name without the extension.
22
+ """
23
+ @staticmethod
24
+ def append_character(text: str, new_char: str) -> str:
25
+ """Appends a character to the end of a string, handling newline endings.
26
+
27
+ Args:
28
+ text (str): The input text string to which the character will be appended.
29
+ new_char (str): The character to append to the text.
30
+
31
+ Returns:
32
+ str: The modified string with the appended character.
33
+ """
@@ -0,0 +1,78 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.html.utils.string_utils import StringUtils as StringUtils
3
+ from gllm_docproc.utils.html_constants import TableConstants as TableConstants
4
+ from typing import Any
5
+
6
class TableUtils:
    """A utility class providing methods for extracting data from HTML tables."""
    colcount: int  # number of columns detected in the table
    table_selector: Incomplete  # selector wrapping the HTML <table> element (concrete type not visible in stub)
    rowspans: Incomplete  # bookkeeping for cells spanning multiple rows
    def __init__(self, table_selector) -> None:
        """Initialize TableUtils with the given table selector.

        Args:
            table_selector: Selector for the HTML table.
        """
    def get_table(self) -> dict[str, Any]:
        """Extract data from the HTML table and return it as a dictionary.

        Returns:
            dict[str, Any]: The extracted table data.
        """
    def extract_table(self):
        """Extract data from the HTML table and return it as a list of lists representing the table structure."""
    def extract_table_row_type(self):
        """Extract metadata from the HTML table and return it as a list of strings representing the row types."""
    def update_col_count(self, row_cells, prev_rowspans) -> None:
        """Update the number of columns in the table.

        Args:
            row_cells: List of HTML cells in a row.
            prev_rowspans: List of previous rowspans.
        """
    def get_row_type(self, row):
        """Get the type of the row.

        Args:
            row: HTML row.

        Returns:
            The row type (presumably one of the row-type strings produced by
            ``extract_table_row_type`` — implementation not visible in this stub).
        """
    def extract_max_char_count(self, table):
        """Extract maximum character count.

        Extract metadata from the HTML table and return it as a list of integers representing
        the maximum number of characters in each column.

        Args:
            table: List of lists representing the table structure.

        Returns:
            list: A list of integers representing the maximum number of characters in each column.
        """
    @staticmethod
    def convert_to_texts(table) -> list[str]:
        """Convert to texts.

        This method processes table content by iterating over its metadata, handling each row based
        on its type, and appending the result to the table data.

        Args:
            table: Table which will be converted to text.

        Returns:
            list[str]: A list of strings containing the extracted table data.
        """
    @staticmethod
    def print_row(row, col_size=None):
        """Formats a table row.

        Args:
            row (list): The row to be formatted.
            col_size (list | None): List of max character sizes in each column.

        Returns:
            str: The formatted row.
        """
    @staticmethod
    def print_table_separator(row):
        """Formats a table separator.

        Args:
            row: The row used to size the separator (presumably its cell widths —
                implementation not visible in this stub).

        Returns:
            str: The formatted table separator.
        """
@@ -0,0 +1,3 @@
1
# Re-export ImageLoader as the public API of the image loader subpackage.
from .image_loader import ImageLoader as ImageLoader

__all__ = ['ImageLoader']