gllm-docproc-binary 0.7.26__cp311-cp311-macosx_13_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gllm-docproc-binary might be problematic. Click here for more details.
- gllm_docproc/__init__.pyi +0 -0
- gllm_docproc/chunker/__init__.pyi +3 -0
- gllm_docproc/chunker/base_chunker.pyi +28 -0
- gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
- gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
- gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
- gllm_docproc/chunker/table/__init__.pyi +3 -0
- gllm_docproc/chunker/table/table_chunker.pyi +45 -0
- gllm_docproc/converter/__init__.pyi +3 -0
- gllm_docproc/converter/base_converter.pyi +15 -0
- gllm_docproc/data_generator/__init__.pyi +5 -0
- gllm_docproc/data_generator/base_data_generator.pyi +18 -0
- gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
- gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
- gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
- gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
- gllm_docproc/downloader/__init__.pyi +5 -0
- gllm_docproc/downloader/base_downloader.pyi +19 -0
- gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
- gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
- gllm_docproc/downloader/html/__init__.pyi +7 -0
- gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
- gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
- gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
- gllm_docproc/downloader/html/html_downloader.pyi +114 -0
- gllm_docproc/downloader/html/playwright_downloader.pyi +60 -0
- gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
- gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
- gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
- gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
- gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
- gllm_docproc/dpo_router/__init__.pyi +5 -0
- gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
- gllm_docproc/dpo_router/loader_router.pyi +52 -0
- gllm_docproc/dpo_router/parser_router.pyi +42 -0
- gllm_docproc/housekeeping/__init__.pyi +3 -0
- gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
- gllm_docproc/indexer/__init__.pyi +3 -0
- gllm_docproc/indexer/base_indexer.pyi +30 -0
- gllm_docproc/indexer/graph/__init__.pyi +4 -0
- gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
- gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
- gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
- gllm_docproc/indexer/vector/__init__.pyi +3 -0
- gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
- gllm_docproc/loader/__init__.pyi +4 -0
- gllm_docproc/loader/audio/__init__.pyi +3 -0
- gllm_docproc/loader/audio/audio_loader.pyi +45 -0
- gllm_docproc/loader/base_loader.pyi +30 -0
- gllm_docproc/loader/csv/__init__.pyi +3 -0
- gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
- gllm_docproc/loader/docx/__init__.pyi +5 -0
- gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
- gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
- gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
- gllm_docproc/loader/exception/__init__.pyi +4 -0
- gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
- gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
- gllm_docproc/loader/html/__init__.pyi +5 -0
- gllm_docproc/loader/html/exception/__init__.pyi +3 -0
- gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
- gllm_docproc/loader/html/flat/__init__.pyi +3 -0
- gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +66 -0
- gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
- gllm_docproc/loader/html/flat/html_flat_merger.pyi +23 -0
- gllm_docproc/loader/html/html_base_loader.pyi +25 -0
- gllm_docproc/loader/html/nested/__init__.pyi +3 -0
- gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
- gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
- gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
- gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
- gllm_docproc/loader/html/utils/__init__.pyi +0 -0
- gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
- gllm_docproc/loader/html/utils/html_utils.pyi +59 -0
- gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
- gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
- gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
- gllm_docproc/loader/image/__init__.pyi +3 -0
- gllm_docproc/loader/image/image_loader.pyi +54 -0
- gllm_docproc/loader/json/__init__.pyi +3 -0
- gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
- gllm_docproc/loader/loader_utils.pyi +43 -0
- gllm_docproc/loader/pdf/__init__.pyi +14 -0
- gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
- gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
- gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
- gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
- gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
- gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
- gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
- gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
- gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
- gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
- gllm_docproc/loader/pipeline_loader.pyi +48 -0
- gllm_docproc/loader/pptx/__init__.pyi +3 -0
- gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
- gllm_docproc/loader/txt/__init__.pyi +3 -0
- gllm_docproc/loader/txt/txt_loader.pyi +55 -0
- gllm_docproc/loader/video/__init__.pyi +3 -0
- gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
- gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
- gllm_docproc/loader/xlsx/__init__.pyi +3 -0
- gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
- gllm_docproc/model/__init__.pyi +7 -0
- gllm_docproc/model/element.pyi +38 -0
- gllm_docproc/model/element_metadata.pyi +35 -0
- gllm_docproc/model/loader_type.pyi +20 -0
- gllm_docproc/model/media.pyi +51 -0
- gllm_docproc/model/parser_type.pyi +19 -0
- gllm_docproc/parser/__init__.pyi +4 -0
- gllm_docproc/parser/base_parser.pyi +28 -0
- gllm_docproc/parser/document/__init__.pyi +7 -0
- gllm_docproc/parser/document/docx_parser.pyi +27 -0
- gllm_docproc/parser/document/pdf_parser.pyi +35 -0
- gllm_docproc/parser/document/pptx_parser.pyi +34 -0
- gllm_docproc/parser/document/txt_parser.pyi +22 -0
- gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
- gllm_docproc/parser/html/__init__.pyi +4 -0
- gllm_docproc/parser/html/flat/__init__.pyi +0 -0
- gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
- gllm_docproc/parser/html/nested/__init__.pyi +0 -0
- gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
- gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
- gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
- gllm_docproc/parser/image/__init__.pyi +4 -0
- gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
- gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
- gllm_docproc/parser/pipeline_parser.pyi +33 -0
- gllm_docproc/parser/table/__init__.pyi +3 -0
- gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
- gllm_docproc/request_handler/__init__.pyi +3 -0
- gllm_docproc/request_handler/base_request_handler.pyi +16 -0
- gllm_docproc/response_handler/__init__.pyi +3 -0
- gllm_docproc/response_handler/base_response_handler.pyi +38 -0
- gllm_docproc/utils/__init__.pyi +3 -0
- gllm_docproc/utils/async_utils.pyi +22 -0
- gllm_docproc/utils/file_utils.pyi +76 -0
- gllm_docproc/utils/html_constants.pyi +122 -0
- gllm_docproc/validator/__init__.pyi +6 -0
- gllm_docproc/validator/base_validator.pyi +34 -0
- gllm_docproc/validator/character_count_validator.pyi +26 -0
- gllm_docproc/validator/file_size_validator.pyi +20 -0
- gllm_docproc/validator/model/__init__.pyi +4 -0
- gllm_docproc/validator/model/validator_input.pyi +50 -0
- gllm_docproc/validator/model/validator_result.pyi +19 -0
- gllm_docproc/validator/page_count_validator.pyi +23 -0
- gllm_docproc/validator/pipeline_validator.pyi +40 -0
- gllm_docproc.build/.gitignore +1 -0
- gllm_docproc.cpython-311-darwin.so +0 -0
- gllm_docproc.pyi +222 -0
- gllm_docproc_binary-0.7.26.dist-info/METADATA +216 -0
- gllm_docproc_binary-0.7.26.dist-info/RECORD +168 -0
- gllm_docproc_binary-0.7.26.dist-info/WHEEL +5 -0
- gllm_docproc_binary-0.7.26.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
|
|
2
|
+
from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
|
|
3
|
+
from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, HEADER as HEADER, TABLE as TABLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
|
|
4
|
+
from gllm_docproc.model.element_metadata import DOCX as DOCX, ElementMetadata as ElementMetadata
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
class PythonDOCXLoader(BaseLoader):
|
|
8
|
+
"""A class for loading and processing DOCX document using PythonDOCXLoader.
|
|
9
|
+
|
|
10
|
+
This class defines the structure for loading and processing DOCX document to retrieve required values
|
|
11
|
+
(Header, Body (Text and Table), Footer). It implements the 'load' method to handle DOCX loading
|
|
12
|
+
from a given file path.
|
|
13
|
+
|
|
14
|
+
PythonDOCXLoader is used to extract the Header, Body (Text and Table), Footer and metadata from the DOCX document.
|
|
15
|
+
|
|
16
|
+
Methods:
|
|
17
|
+
load(source, loaded_elements, **kwargs): Load a DOCX document.
|
|
18
|
+
"""
|
|
19
|
+
def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
|
|
20
|
+
"""Load and process a DOCX document specified by the file path and name (source).
|
|
21
|
+
|
|
22
|
+
This method defines the process of loading a DOCX document using its file path.
|
|
23
|
+
It uses PythonDOCX to extract element text (with text structure) and table from the DOCX document.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
source (str): The path to the DOCX document file.
|
|
27
|
+
loaded_elements (list[dict[str, Any]] | None): A list of dictionaries containing loaded content and metadata.
|
|
28
|
+
**kwargs (Any): Additional keyword arguments for the loader.
|
|
29
|
+
|
|
30
|
+
Kwargs:
|
|
31
|
+
original_source (str, optional): The original source of the document.
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
|
|
35
|
+
"""
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
|
|
2
|
+
from gllm_docproc.loader.docx.python_docx_loader import PythonDOCXLoader as PythonDOCXLoader
|
|
3
|
+
from gllm_docproc.model.element import Element as Element, TABLE as TABLE
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
class PythonDOCXTableLoader(BaseLoader):
|
|
7
|
+
"""Python DOCX Table Loader class to load tables from DOCX document.
|
|
8
|
+
|
|
9
|
+
This class is used to load tables from DOCX document using python-docx library.
|
|
10
|
+
Then it combines the existing loaded elements with the loaded tables.
|
|
11
|
+
|
|
12
|
+
Methods:
|
|
13
|
+
load: Load the tables from the DOCX document and combine it with the existing loaded elements.
|
|
14
|
+
_filter_table_elements: Filter the table elements from the loaded elements.
|
|
15
|
+
_get_table_content_count: Get the table content count.
|
|
16
|
+
_is_table_match: Check whether the table matches the merged table.
|
|
17
|
+
_find_matching_merged_table: Find the matching merged table.
|
|
18
|
+
"""
|
|
19
|
+
def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
|
|
20
|
+
"""Load the tables from the DOCX document and combine it with the existing loaded elements.
|
|
21
|
+
|
|
22
|
+
This function loads the tables from the DOCX document using python-docx library.
|
|
23
|
+
Then it combines the existing loaded elements with the loaded tables.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
source (str): The source file path.
|
|
27
|
+
loaded_elements (list[dict[str, Any]] | None): The existing loaded elements.
|
|
28
|
+
**kwargs (Any): The keyword arguments.
|
|
29
|
+
|
|
30
|
+
Kwargs:
|
|
31
|
+
original_source (str, optional): The original source of the document.
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
list[dict[str, Any]]: The loaded elements.
|
|
35
|
+
"""
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
|
|
3
|
+
class VideoConversionError(Exception):
|
|
4
|
+
"""An exception for video conversion failures."""
|
|
5
|
+
message: Incomplete
|
|
6
|
+
def __init__(self, video_path: str, cause: str) -> None:
|
|
7
|
+
"""Initialize the exception.
|
|
8
|
+
|
|
9
|
+
Args:
|
|
10
|
+
video_path (str): Path to the video file that failed to convert.
|
|
11
|
+
cause (str): Description of the underlying cause of the failure.
|
|
12
|
+
"""
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.loader.html.utils.html_utils import resolve_relative_url as resolve_relative_url
|
|
3
|
+
from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
|
|
4
|
+
from gllm_docproc.loader.html.utils.string_utils import StringUtils as StringUtils
|
|
5
|
+
from gllm_docproc.model.element import Element as Element
|
|
6
|
+
from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
|
|
7
|
+
from gllm_docproc.model.media import Media as Media, MediaSourceType as MediaSourceType, MediaType as MediaType
|
|
8
|
+
from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
|
|
9
|
+
from parsel import Selector, SelectorList
|
|
10
|
+
from typing import Callable
|
|
11
|
+
|
|
12
|
+
def setup_mingw_path_for_cairosvg() -> None:
|
|
13
|
+
"""On Windows, add mingw bin to PATH so CairoSVG can find required DLLs.
|
|
14
|
+
|
|
15
|
+
CairoSVG relies on DLLs (e.g., for font rendering and image processing) that may not be available
|
|
16
|
+
in the default PATH. Adding the mingw bin directory ensures these dependencies are found,
|
|
17
|
+
preventing import or runtime errors.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
CAIROSVG_AVAILABLE: bool
|
|
21
|
+
logger: Incomplete
|
|
22
|
+
|
|
23
|
+
def is_base_element(content_selector: Selector | SelectorList[Selector] | None, removed_components: RemovedComponents) -> bool:
|
|
24
|
+
"""Check if the given content selector represents a base element.
|
|
25
|
+
|
|
26
|
+
A base element is determined by the type of element in the HTML document.
|
|
27
|
+
Supported base elements include:
|
|
28
|
+
1. Unsupported result (if content_selector is None)
|
|
29
|
+
2. String text
|
|
30
|
+
3. Removed Components (by class or tag) defined in RemovedComponents
|
|
31
|
+
4. <input>
|
|
32
|
+
5. <svg> image
|
|
33
|
+
6. <img>
|
|
34
|
+
7. <audio>, <video> (if multiple sources are given, select only the first one)
|
|
35
|
+
8. <iframe> (cannot get the content of the iframe)
|
|
36
|
+
9. <embed> (cannot get the content of the embed)
|
|
37
|
+
10. <br>
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
content_selector (Selector | SelectorList[Selector] | None): The selector representing the HTML content.
|
|
41
|
+
removed_components (RemovedComponents): Components to be removed from processing.
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
bool: True if the content_selector represents a base element; False otherwise.
|
|
45
|
+
"""
|
|
46
|
+
def handle_base_element(content_selector: Selector | SelectorList[Selector] | None, html_head: ElementMetadata, removed_components: RemovedComponents) -> list[Element]:
|
|
47
|
+
"""Handle the base HTML element and generate Element instances.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
content_selector (Selector | SelectorList[Selector] | None): The selector representing the HTML content.
|
|
51
|
+
html_head (ElementMetadata): The metadata extracted from the HTML head.
|
|
52
|
+
removed_components (RemovedComponents): Components to be removed from processing.
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
list[Element]: A list of Element instances generated from the HTML content.
|
|
56
|
+
"""
|
|
57
|
+
def get_handler(tag: str) -> Callable[[Selector | SelectorList[Selector], ElementMetadata], list[Element]] | None:
|
|
58
|
+
"""Get the handler function corresponding to the given HTML tag.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
tag (str): The HTML tag for which the handler function is requested.
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
Callable[[Selector | SelectorList[Selector], ElementMetadata], list[Element]] | None: The handler
|
|
65
|
+
function corresponding to the given HTML tag.
|
|
66
|
+
"""
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
|
|
2
|
+
from gllm_docproc.loader.html.flat.html_flat_base_handler import handle_base_element as handle_base_element, is_base_element as is_base_element
|
|
3
|
+
from gllm_docproc.loader.html.flat.html_flat_merger import merge_html_elements as merge_html_elements
|
|
4
|
+
from gllm_docproc.loader.html.html_base_loader import HTMLBaseLoader as HTMLBaseLoader
|
|
5
|
+
from gllm_docproc.loader.html.utils.html_utils import extract_html_head as extract_html_head, extract_html_title_tag as extract_html_title_tag
|
|
6
|
+
from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
|
|
7
|
+
from gllm_docproc.model.element import Element as Element
|
|
8
|
+
from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
|
|
9
|
+
from parsel import Selector, SelectorList
|
|
10
|
+
|
|
11
|
+
class HTMLFlatLoader(HTMLBaseLoader):
|
|
12
|
+
"""A loader class for loading web content and extracting information.
|
|
13
|
+
|
|
14
|
+
This class inherits from the BaseLoader class and provides methods to load web content,
|
|
15
|
+
extract information, and scrape data using Scrapy spiders.
|
|
16
|
+
"""
|
|
17
|
+
def __init__(self) -> None:
|
|
18
|
+
"""Initialize the HTMLFlatLoader."""
|
|
19
|
+
@classmethod
|
|
20
|
+
def extract_html_element(cls, content_selector: SelectorList[Selector] | Selector, html_head: ElementMetadata, removed_components: RemovedComponents) -> list[Element]:
|
|
21
|
+
"""Recursively extract the content of an HTML element.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
content_selector (SelectorList[Selector] | Selector): The content selector.
|
|
25
|
+
html_head (ElementMetadata): The HTML head metadata.
|
|
26
|
+
removed_components (RemovedComponents): The removed components.
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
list[Element]: A list of web elements.
|
|
30
|
+
"""
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from gllm_docproc.loader.html.utils.flat_table_utils import FlatTableUtils as FlatTableUtils
|
|
2
|
+
from gllm_docproc.loader.html.utils.html_utils import resolve_relative_url as resolve_relative_url
|
|
3
|
+
from gllm_docproc.model.element import Element as Element
|
|
4
|
+
from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
|
|
5
|
+
from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys
|
|
6
|
+
from parsel import Selector, SelectorList
|
|
7
|
+
|
|
8
|
+
def merge_html_elements(content_selector: Selector | SelectorList[Selector], contents: list[Element], html_head: ElementMetadata) -> list[Element]:
|
|
9
|
+
"""For non-base element, add metadata and merge children into one with parent element.
|
|
10
|
+
|
|
11
|
+
1. Add its HTML tag into its metadata
|
|
12
|
+
2. For some HTML tags, combine its children into a single element in the parent, for example:
|
|
13
|
+
1. Combine <ul> / <ol> children into a single element
|
|
14
|
+
2. Combine <a> children to become [text](https://link.com)
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
content_selector (Selector | SelectorList[Selector]): The content selector representing the HTML element.
|
|
18
|
+
contents (list[Element]): list of Element instances representing the contents of the HTML element.
|
|
19
|
+
html_head (ElementMetadata): The metadata extracted from the HTML head.
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
list[Element]: list of Element instances after handling the contents based on the parent tag.
|
|
23
|
+
"""
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
|
|
2
|
+
from gllm_docproc.loader.html.utils.html_utils import is_html_content as is_html_content
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
class HTMLBaseLoader(BaseLoader):
|
|
6
|
+
"""A loader class for loading web content and extracting information.
|
|
7
|
+
|
|
8
|
+
This class inherits from the BaseLoader class and provides methods to load web content,
|
|
9
|
+
extract information, and scrape data using Scrapy spiders.
|
|
10
|
+
"""
|
|
11
|
+
URL_INDEX: int
|
|
12
|
+
CONTENT_INDEX: int
|
|
13
|
+
def __init__(self, load_from_html_string: Any) -> None:
|
|
14
|
+
"""Initialize the HTMLBaseLoader."""
|
|
15
|
+
def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
|
|
16
|
+
"""Loads web content and returns the extracted information in JSON format.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
source (str): The source of the web content, either a URL or a file path.
|
|
20
|
+
loaded_elements (list[dict]): A list of loaded elements to be processed.
|
|
21
|
+
**kwargs (dict[str, Any]): Additional keyword arguments.
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
list[dict]: The extracted information in JSON format.
|
|
25
|
+
"""
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
class DictionaryUtils:
|
|
2
|
+
"""A utility class providing methods to manipulate dictionaries."""
|
|
3
|
+
@staticmethod
|
|
4
|
+
def add_or_skip_value(dictionary, key, value):
|
|
5
|
+
"""Adds a value to a dictionary if the value is not None for a given key.
|
|
6
|
+
|
|
7
|
+
Args:
|
|
8
|
+
dictionary (dict): The dictionary to be modified.
|
|
9
|
+
key (hashable): The key where the value needs to be added.
|
|
10
|
+
value (any): The value to be added.
|
|
11
|
+
|
|
12
|
+
Returns:
|
|
13
|
+
dict: The modified dictionary.
|
|
14
|
+
"""
|
|
15
|
+
@staticmethod
|
|
16
|
+
def append_value(dictionary, key, value):
|
|
17
|
+
"""Appends a value to a list under a specific key in a dictionary.
|
|
18
|
+
|
|
19
|
+
If the key already exists in the dictionary, the value is appended to the list under that key.
|
|
20
|
+
If the key does not exist, a new list is created with the value as its first element.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
dictionary (dict): The dictionary to be modified.
|
|
24
|
+
key (hashable): The key under which the value needs to be added.
|
|
25
|
+
value (any): The value to be appended to the list under the key.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
dict: The modified dictionary with the value appended to the list under the key.
|
|
29
|
+
"""
|
|
30
|
+
@staticmethod
|
|
31
|
+
def put_key_to_bottom(dictionary, key):
|
|
32
|
+
"""Rearrange key in dictionary.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
dictionary (dict): The dictionary to be modified.
|
|
36
|
+
key (hashable): The key.
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
dict: The modified dictionary.
|
|
40
|
+
"""
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
from .html_nested_element_handler import get_element as get_element
|
|
2
|
+
from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
|
|
3
|
+
from gllm_docproc.loader.html.nested.dictionary_utils import DictionaryUtils as DictionaryUtils
|
|
4
|
+
from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
|
|
5
|
+
from gllm_docproc.loader.html.utils.string_utils import StringUtils as StringUtils
|
|
6
|
+
from gllm_docproc.loader.html.utils.table_utils import TableUtils as TableUtils
|
|
7
|
+
from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
|
|
8
|
+
|
|
9
|
+
def get_element_content(content_selector, removed_components: RemovedComponents) -> list[dict]:
|
|
10
|
+
'''Traverses each element to get the content.
|
|
11
|
+
|
|
12
|
+
This function extracts the HTML body recursively.
|
|
13
|
+
|
|
14
|
+
Input example:
|
|
15
|
+
|
|
16
|
+
.. code-block:: html
|
|
17
|
+
|
|
18
|
+
<html>
|
|
19
|
+
<head>
|
|
20
|
+
<title>Title</title>
|
|
21
|
+
</head>
|
|
22
|
+
<body>
|
|
23
|
+
<div class="container">
|
|
24
|
+
<h1>Welcome to My Website</h1>
|
|
25
|
+
<div>
|
|
26
|
+
Hello World
|
|
27
|
+
</div>
|
|
28
|
+
</div>
|
|
29
|
+
<p>This is another paragraph.</p>
|
|
30
|
+
</body>
|
|
31
|
+
</html>
|
|
32
|
+
|
|
33
|
+
Output:
|
|
34
|
+
|
|
35
|
+
.. code-block:: groovy
|
|
36
|
+
|
|
37
|
+
[
|
|
38
|
+
{
|
|
39
|
+
\'tag\': \'body\',
|
|
40
|
+
\'class\': None,
|
|
41
|
+
\'content\': [
|
|
42
|
+
{
|
|
43
|
+
\'tag\': \'div\',
|
|
44
|
+
\'class\': \'container\',
|
|
45
|
+
\'content\': [
|
|
46
|
+
{
|
|
47
|
+
\'tag\': \'h1\',
|
|
48
|
+
\'class\': None, \'content\': [
|
|
49
|
+
{
|
|
50
|
+
\'tag\': \'text\',
|
|
51
|
+
\'content\': \'Welcome to My Website\'
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
\'tag\': \'div\',
|
|
57
|
+
\'class\': None,
|
|
58
|
+
\'content\': [
|
|
59
|
+
{\'tag\': \'text\', \'content\': \'Hello World\'}
|
|
60
|
+
]
|
|
61
|
+
}
|
|
62
|
+
]
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
\'tag\': \'p\',
|
|
66
|
+
\'class\': None,
|
|
67
|
+
\'content\': [
|
|
68
|
+
{\'tag\': \'text\', \'content\': \'This is another paragraph.\'}
|
|
69
|
+
]
|
|
70
|
+
}
|
|
71
|
+
]
|
|
72
|
+
}
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
content_selector: The content to be traversed.
|
|
77
|
+
removed_components: Removed class or tags.
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
The List of extracted contents.
|
|
81
|
+
'''
|
|
82
|
+
def is_base_element(content_selector, removed_components: RemovedComponents) -> bool:
|
|
83
|
+
"""Check if the given content selector represents a base element.
|
|
84
|
+
|
|
85
|
+
See html_flat_base_handler.py for more information.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
content_selector: The content selector to check.
|
|
89
|
+
removed_components (RemovedComponents): An instance of RemovedComponents class.
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
bool: True if the content_selector represents a base element; False otherwise.
|
|
93
|
+
"""
|
|
94
|
+
def handle_base_element(content_selector, removed_components: RemovedComponents) -> list[dict]:
|
|
95
|
+
"""Handle the processing of a base HTML element.
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
content_selector: The content selector representing the HTML element.
|
|
99
|
+
removed_components (RemovedComponents): An object containing information about components to be removed.
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
List of dict : A List of dictionaries containing information about the HTML element, or None
|
|
103
|
+
if the element should be skipped.
|
|
104
|
+
- tag: HTML tag name.
|
|
105
|
+
- class: CSS class of the element (if available).
|
|
106
|
+
Additional keys may be added based on the specific element handler.
|
|
107
|
+
"""
|
|
108
|
+
def get_handler(tag: str):
|
|
109
|
+
"""Get the handler for the specified HTML tag.
|
|
110
|
+
|
|
111
|
+
Args:
|
|
112
|
+
tag (str): The HTML tag to get the handler for.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
dict: A dictionary containing information about the HTML element.
|
|
116
|
+
- tag: HTML tag name.
|
|
117
|
+
- class: CSS class of the element (if available).
|
|
118
|
+
Additional keys may be added based on the specific element handler
|
|
119
|
+
"""
|
|
120
|
+
def create_text_dict(message):
|
|
121
|
+
"""Creates a dictionary with 'text' tag and specified content.
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
message: The content to be added to the dictionary.
|
|
125
|
+
|
|
126
|
+
Returns:
|
|
127
|
+
A dictionary with 'text' tag and specified content.
|
|
128
|
+
"""
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from gllm_docproc.loader.html.nested.dictionary_utils import DictionaryUtils as DictionaryUtils
|
|
2
|
+
from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, HTMLTags as HTMLTags
|
|
3
|
+
|
|
4
|
+
def get_element(content_selector) -> dict:
|
|
5
|
+
"""Get the element information from the specified content selector.
|
|
6
|
+
|
|
7
|
+
Args:
|
|
8
|
+
content_selector: The content selector representing the HTML element.
|
|
9
|
+
|
|
10
|
+
Returns:
|
|
11
|
+
dict: A dictionary containing information about the HTML element.
|
|
12
|
+
- tag: HTML tag name.
|
|
13
|
+
- class: CSS class of the element (if available).
|
|
14
|
+
Additional keys may be added based on the specific element handler
|
|
15
|
+
"""
|
|
16
|
+
def get_handler(tag: str):
|
|
17
|
+
"""Gets the handler for the specified HTML tag.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
tag (str): The HTML tag to get the handler for.
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
Callable: The handler function for the specified HTML tag.
|
|
24
|
+
"""
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from .html_nested_base_handler import get_element_content as get_element_content
|
|
2
|
+
from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
|
|
3
|
+
from gllm_docproc.loader.html.html_base_loader import HTMLBaseLoader as HTMLBaseLoader
|
|
4
|
+
from gllm_docproc.loader.html.utils.html_utils import extract_html_head as extract_html_head
|
|
5
|
+
from gllm_docproc.loader.html.utils.removed_components import RemovedComponents as RemovedComponents
|
|
6
|
+
from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, MetaDataKeys as MetaDataKeys
|
|
7
|
+
|
|
8
|
+
class HTMLNestedLoader(HTMLBaseLoader):
|
|
9
|
+
"""A loader class for loading web content and extracting information.
|
|
10
|
+
|
|
11
|
+
This class inherits from the BaseLoader class and provides methods to load web content,
|
|
12
|
+
extract information, and scrape data using Scrapy spiders.
|
|
13
|
+
"""
|
|
14
|
+
def __init__(self) -> None:
|
|
15
|
+
"""Initialize the HTMLNestedLoader."""
|
|
File without changes
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.model.element import Element as Element
|
|
3
|
+
from gllm_docproc.utils.html_constants import HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys, TableConstants as TableConstants
|
|
4
|
+
|
|
5
|
+
class FlatTableUtils:
|
|
6
|
+
"""A utility class providing methods for extracting data from HTML tables."""
|
|
7
|
+
colcount: int
|
|
8
|
+
rowspans: Incomplete
|
|
9
|
+
def __init__(self) -> None:
|
|
10
|
+
"""Initialize the FlatTableUtils."""
|
|
11
|
+
def generate_tables(self, content: list[Element]) -> list[list[str]]:
|
|
12
|
+
"""Generate tables from HTML content.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
content (List[Element]): The list of Element instances representing the HTML content.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
List[List[str]]: A list containing the generated tables.
|
|
19
|
+
"""
|
|
20
|
+
def filter_table(self, table_content: list[Element]) -> tuple[list[Element], list[Element]]:
|
|
21
|
+
"""Filter the HTML table content.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
table_content (List[Element]): The list of Element instances representing the HTML table.
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
tuple[List[Element], List[Element]]: A tuple containing the filtered table content
|
|
28
|
+
and the removed elements.
|
|
29
|
+
"""
|
|
30
|
+
def find_and_update_table_media(self, table_element: Element, elements: list[Element]) -> tuple[Element, list[Element]]:
|
|
31
|
+
"""Find images in the table and return updated table element and found images.
|
|
32
|
+
|
|
33
|
+
Instead of modifying the input table_element directly, this function creates a copy and returns
|
|
34
|
+
both the updated table element and the list of images found in the table.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
table_element (Element): The table element.
|
|
38
|
+
elements (list[Element]): The list of elements.
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
tuple[Element, list[Element]]: A tuple containing:
|
|
42
|
+
- Updated copy of the table element with media metadata
|
|
43
|
+
- List of images found in the table
|
|
44
|
+
"""
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from gllm_docproc.loader.html.exception import HtmlLoadException as HtmlLoadException
|
|
2
|
+
from gllm_docproc.model.element import Element as Element
|
|
3
|
+
from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, HTML as HTML
|
|
4
|
+
from gllm_docproc.utils.html_constants import HTMLTags as HTMLTags, MetaDataKeys as MetaDataKeys
|
|
5
|
+
from scrapy.http import HtmlResponse
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
def is_html_content(content: str) -> bool:
|
|
9
|
+
'''Check if the provided content appears to be HTML.
|
|
10
|
+
|
|
11
|
+
This function performs a case-insensitive check to determine if the content contains HTML tags,
|
|
12
|
+
specifically by searching for the opening and closing HTML tags ("<html" and "</html>").
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
content (str): The content to check.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
bool: True if the content is identified as HTML; False otherwise.
|
|
19
|
+
'''
|
|
20
|
+
def extract_html_head(response: HtmlResponse, element_metadata: dict[str, Any] | None) -> ElementMetadata:
|
|
21
|
+
"""Extracts metadata from an HTML response.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
response (HtmlResponse): The HTML response.
|
|
25
|
+
element_metadata (dict[str, Any] | None): The element metadata.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
ElementMetadata: A class containing element metadata.
|
|
29
|
+
|
|
30
|
+
Raises:
|
|
31
|
+
HtmlLoadException: If an error occurs during the extraction process.
|
|
32
|
+
"""
|
|
33
|
+
def extract_html_title_tag(metadata: ElementMetadata) -> list[Element]:
|
|
34
|
+
"""Gets the title element as an Element.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
metadata (ElementMetadata): A class containing element metadata.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
List[Element]: List containing a single Element instance representing the title element.
|
|
41
|
+
"""
|
|
42
|
+
def resolve_relative_url(url: str, base_url: str) -> str:
|
|
43
|
+
'''Resolve a relative URL against a base URL.
|
|
44
|
+
|
|
45
|
+
This function handles various relative URL patterns:
|
|
46
|
+
1. Relative paths: "image.png", "folder/image.png"
|
|
47
|
+
2. Parent directory: "../image.png"
|
|
48
|
+
3. Absolute paths: "/image.png"
|
|
49
|
+
4. Already absolute URLs are returned as-is
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
url (str): The URL to resolve (may be relative or absolute).
|
|
53
|
+
base_url (str): The base URL to resolve against (can be a full path like
|
|
54
|
+
"http://example.com/articles/news/news_id").
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
str: The resolved absolute URL, or the original url if it\'s already absolute
|
|
58
|
+
or base_url is invalid.
|
|
59
|
+
'''
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
class RemovedComponents:
|
|
2
|
+
"""Class representing removed components from a document.
|
|
3
|
+
|
|
4
|
+
This class defines three methods for retrieving partial class, full class, and HTML tags
|
|
5
|
+
associated with removed components.
|
|
6
|
+
"""
|
|
7
|
+
def get_partial_class(self) -> list[str]:
|
|
8
|
+
"""Get partial class.
|
|
9
|
+
|
|
10
|
+
Method to get the partial class of the removed component. Partial class consists of
|
|
11
|
+
classes that will be filtered.
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
list[str]: The partial class names associated with the removed component.
|
|
15
|
+
"""
|
|
16
|
+
def get_full_class(self) -> list[str]:
|
|
17
|
+
"""Get full class.
|
|
18
|
+
|
|
19
|
+
Method to get the full class of the removed component. Full class consists of
|
|
20
|
+
exact match of classes that will be filtered.
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
list[str]: The full class names associated with the removed component.
|
|
24
|
+
"""
|
|
25
|
+
def get_html_tags(self) -> list[str]:
|
|
26
|
+
"""Method to get the HTML tags associated with the removed component.
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
list[str]: A list of HTML tags associated with the removed component.
|
|
30
|
+
"""
|
|
31
|
+
@staticmethod
|
|
32
|
+
def is_removed_component(tag: str | None, class_: str | None, removed_components: RemovedComponents | None) -> bool:
|
|
33
|
+
"""Checks if a component should be removed based on its tag and class.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
tag (str | None): The tag of the component.
|
|
37
|
+
class_ (str | None): The class of the component.
|
|
38
|
+
removed_components (RemovedComponents | None): The components to be removed, including HTML tags and classes.
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
bool: True if the component should be removed, False otherwise.
|
|
42
|
+
"""
|
|
43
|
+
@staticmethod
|
|
44
|
+
def check_list_in_substring(message: str, check_list: list[str]) -> bool:
|
|
45
|
+
"""Checks if any substring from the check_list exists in the message string.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
message (str): The string to search for substrings.
|
|
49
|
+
check_list (list): A list of substrings to be checked.
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
bool: True if any substring from check_list is found in message, otherwise False.
|
|
53
|
+
"""
|