gllm-docproc-binary 0.7.22__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gllm-docproc-binary might be problematic. Click here for more details.
- gllm_docproc/__init__.pyi +0 -0
- gllm_docproc/chunker/__init__.pyi +3 -0
- gllm_docproc/chunker/base_chunker.pyi +28 -0
- gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
- gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
- gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
- gllm_docproc/chunker/table/__init__.pyi +3 -0
- gllm_docproc/chunker/table/table_chunker.pyi +45 -0
- gllm_docproc/converter/__init__.pyi +3 -0
- gllm_docproc/converter/base_converter.pyi +15 -0
- gllm_docproc/data_generator/__init__.pyi +5 -0
- gllm_docproc/data_generator/base_data_generator.pyi +18 -0
- gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
- gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
- gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
- gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
- gllm_docproc/downloader/__init__.pyi +5 -0
- gllm_docproc/downloader/base_downloader.pyi +19 -0
- gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
- gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
- gllm_docproc/downloader/html/__init__.pyi +6 -0
- gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
- gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
- gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
- gllm_docproc/downloader/html/html_downloader.pyi +114 -0
- gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
- gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
- gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
- gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
- gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
- gllm_docproc/dpo_router/__init__.pyi +5 -0
- gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
- gllm_docproc/dpo_router/loader_router.pyi +52 -0
- gllm_docproc/dpo_router/parser_router.pyi +42 -0
- gllm_docproc/housekeeping/__init__.pyi +3 -0
- gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
- gllm_docproc/indexer/__init__.pyi +3 -0
- gllm_docproc/indexer/base_indexer.pyi +30 -0
- gllm_docproc/indexer/graph/__init__.pyi +4 -0
- gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
- gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
- gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
- gllm_docproc/indexer/vector/__init__.pyi +3 -0
- gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
- gllm_docproc/loader/__init__.pyi +4 -0
- gllm_docproc/loader/audio/__init__.pyi +3 -0
- gllm_docproc/loader/audio/audio_loader.pyi +45 -0
- gllm_docproc/loader/base_loader.pyi +30 -0
- gllm_docproc/loader/csv/__init__.pyi +3 -0
- gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
- gllm_docproc/loader/docx/__init__.pyi +5 -0
- gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
- gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
- gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
- gllm_docproc/loader/exception/__init__.pyi +4 -0
- gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
- gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
- gllm_docproc/loader/html/__init__.pyi +5 -0
- gllm_docproc/loader/html/exception/__init__.pyi +3 -0
- gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
- gllm_docproc/loader/html/flat/__init__.pyi +3 -0
- gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +65 -0
- gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
- gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
- gllm_docproc/loader/html/html_base_loader.pyi +25 -0
- gllm_docproc/loader/html/nested/__init__.pyi +3 -0
- gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
- gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
- gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
- gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
- gllm_docproc/loader/html/utils/__init__.pyi +0 -0
- gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
- gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
- gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
- gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
- gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
- gllm_docproc/loader/image/__init__.pyi +3 -0
- gllm_docproc/loader/image/image_loader.pyi +54 -0
- gllm_docproc/loader/json/__init__.pyi +3 -0
- gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
- gllm_docproc/loader/loader_utils.pyi +43 -0
- gllm_docproc/loader/pdf/__init__.pyi +14 -0
- gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
- gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
- gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
- gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
- gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
- gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
- gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
- gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
- gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
- gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
- gllm_docproc/loader/pipeline_loader.pyi +48 -0
- gllm_docproc/loader/pptx/__init__.pyi +3 -0
- gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
- gllm_docproc/loader/txt/__init__.pyi +3 -0
- gllm_docproc/loader/txt/txt_loader.pyi +55 -0
- gllm_docproc/loader/video/__init__.pyi +3 -0
- gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
- gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
- gllm_docproc/loader/xlsx/__init__.pyi +3 -0
- gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
- gllm_docproc/model/__init__.pyi +7 -0
- gllm_docproc/model/element.pyi +38 -0
- gllm_docproc/model/element_metadata.pyi +35 -0
- gllm_docproc/model/loader_type.pyi +20 -0
- gllm_docproc/model/media.pyi +51 -0
- gllm_docproc/model/parser_type.pyi +19 -0
- gllm_docproc/parser/__init__.pyi +4 -0
- gllm_docproc/parser/base_parser.pyi +28 -0
- gllm_docproc/parser/document/__init__.pyi +7 -0
- gllm_docproc/parser/document/docx_parser.pyi +27 -0
- gllm_docproc/parser/document/pdf_parser.pyi +35 -0
- gllm_docproc/parser/document/pptx_parser.pyi +34 -0
- gllm_docproc/parser/document/txt_parser.pyi +22 -0
- gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
- gllm_docproc/parser/html/__init__.pyi +4 -0
- gllm_docproc/parser/html/flat/__init__.pyi +0 -0
- gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
- gllm_docproc/parser/html/nested/__init__.pyi +0 -0
- gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
- gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
- gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
- gllm_docproc/parser/image/__init__.pyi +4 -0
- gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
- gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
- gllm_docproc/parser/pipeline_parser.pyi +33 -0
- gllm_docproc/parser/table/__init__.pyi +3 -0
- gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
- gllm_docproc/request_handler/__init__.pyi +3 -0
- gllm_docproc/request_handler/base_request_handler.pyi +16 -0
- gllm_docproc/response_handler/__init__.pyi +3 -0
- gllm_docproc/response_handler/base_response_handler.pyi +38 -0
- gllm_docproc/utils/__init__.pyi +3 -0
- gllm_docproc/utils/async_utils.pyi +22 -0
- gllm_docproc/utils/file_utils.pyi +76 -0
- gllm_docproc/utils/html_constants.pyi +122 -0
- gllm_docproc/validator/__init__.pyi +6 -0
- gllm_docproc/validator/base_validator.pyi +34 -0
- gllm_docproc/validator/character_count_validator.pyi +26 -0
- gllm_docproc/validator/file_size_validator.pyi +20 -0
- gllm_docproc/validator/model/__init__.pyi +4 -0
- gllm_docproc/validator/model/validator_input.pyi +50 -0
- gllm_docproc/validator/model/validator_result.pyi +19 -0
- gllm_docproc/validator/page_count_validator.pyi +23 -0
- gllm_docproc/validator/pipeline_validator.pyi +40 -0
- gllm_docproc.build/.gitignore +1 -0
- gllm_docproc.cp311-win_amd64.pyd +0 -0
- gllm_docproc.pyi +220 -0
- gllm_docproc_binary-0.7.22.dist-info/METADATA +216 -0
- gllm_docproc_binary-0.7.22.dist-info/RECORD +167 -0
- gllm_docproc_binary-0.7.22.dist-info/WHEEL +5 -0
- gllm_docproc_binary-0.7.22.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.model.element import Element as Element, HEADING as HEADING, PARAGRAPH as PARAGRAPH, TABLE as TABLE, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
|
|
3
|
+
from gllm_docproc.parser.base_parser import BaseParser as BaseParser
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
TABLE_AND_CAPTION_STRUCTURE: Incomplete
|
|
7
|
+
UPPER_ELEMENT_IS_CAPTION: str
|
|
8
|
+
LOWER_ELEMENT_IS_CAPTION: str
|
|
9
|
+
MAX_CAPTION_LENGTH: str
|
|
10
|
+
REMOVE_CAPTION_FROM_ELEMENT: str
|
|
11
|
+
MAX_CAPTION_ELEMENTS: str
|
|
12
|
+
UPPER_CAPTION_EXTRACTOR: str
|
|
13
|
+
LOWER_CAPTION_EXTRACTOR: str
|
|
14
|
+
|
|
15
|
+
def curry_upper_caption_extractor(remove_caption_from_element: bool):
|
|
16
|
+
"""Curry Upper Caption Extractor.
|
|
17
|
+
|
|
18
|
+
This function curries the extract_upper_caption function with the remove_caption_from_element parameter.
|
|
19
|
+
|
|
20
|
+
Why do we need to use currying?
|
|
21
|
+
1. so user can customize the upper_caption_extractor function
|
|
22
|
+
2. the customized upper_caption_extractor may not require the remove_caption_from_element parameter
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
remove_caption_from_element (bool): A boolean value to remove the caption from the element.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
function: The function to extract the upper caption.
|
|
29
|
+
"""
|
|
30
|
+
def curry_lower_caption_extractor(remove_caption_from_element: bool):
|
|
31
|
+
"""Curry Lower Caption Extractor.
|
|
32
|
+
|
|
33
|
+
This function curries the extract_lower_caption function with the remove_caption_from_element parameter.
|
|
34
|
+
|
|
35
|
+
Why do we need to use currying?
|
|
36
|
+
1. so user can customize the lower_caption_extractor function
|
|
37
|
+
2. the customized lower_caption_extractor may not require the remove_caption_from_element parameter
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
remove_caption_from_element (bool): A boolean value to remove the caption from the element.
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
function: The function to extract the lower caption.
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
class TableCaptionParser(BaseParser):
|
|
47
|
+
"""TableCaptionParser class.
|
|
48
|
+
|
|
49
|
+
A class to extract table captions from a document and add them to the metadata of the table element.
|
|
50
|
+
|
|
51
|
+
Methods:
|
|
52
|
+
parse(loaded_elements, **kwargs): Extract table captions from a document and add them to
|
|
53
|
+
the metadata of the table element.
|
|
54
|
+
"""
|
|
55
|
+
def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
|
|
56
|
+
"""Parses the elements to extract table captions.
|
|
57
|
+
|
|
58
|
+
This method extracts table captions from the elements and adds them to the metadata of the table element.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
loaded_elements (list[dict[str, Any]]): The elements to extract table captions from.
|
|
62
|
+
**kwargs (Any): Additional keyword arguments for customization.
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
list[dict[str, Any]]: The elements with the table captions added to the metadata.
|
|
66
|
+
"""
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
class BaseRequestHandler(ABC):
|
|
5
|
+
"""Base class for request handler."""
|
|
6
|
+
@abstractmethod
|
|
7
|
+
def handle_request(self, **kwargs: Any) -> None:
|
|
8
|
+
"""Handles a request.
|
|
9
|
+
|
|
10
|
+
Args:
|
|
11
|
+
**kwargs (Any): Arbitrary keyword arguments.
|
|
12
|
+
The implementing class is responsible for defining the arguments.
|
|
13
|
+
|
|
14
|
+
Returns:
|
|
15
|
+
None
|
|
16
|
+
"""
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
class BaseResponseHandler(ABC):
|
|
5
|
+
"""Base class for response handler."""
|
|
6
|
+
@abstractmethod
|
|
7
|
+
def handle_success_response(self, **kwargs: Any) -> None:
|
|
8
|
+
"""Handles a success response (successfully indexed).
|
|
9
|
+
|
|
10
|
+
Args:
|
|
11
|
+
**kwargs (Any): Arbitrary keyword arguments.
|
|
12
|
+
The implementing class is responsible for defining the arguments.
|
|
13
|
+
|
|
14
|
+
Returns:
|
|
15
|
+
None
|
|
16
|
+
"""
|
|
17
|
+
@abstractmethod
|
|
18
|
+
def handle_deleted_response(self, **kwargs: Any) -> None:
|
|
19
|
+
"""Handles a deleted response (successfully deleted).
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
**kwargs (Any): Arbitrary keyword arguments.
|
|
23
|
+
The implementing class is responsible for defining the arguments.
|
|
24
|
+
|
|
25
|
+
Returns:
|
|
26
|
+
None
|
|
27
|
+
"""
|
|
28
|
+
@abstractmethod
|
|
29
|
+
def handle_failed_response(self, **kwargs: Any) -> None:
|
|
30
|
+
"""Handles a failed response (either failed to index or failed to delete).
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
**kwargs (Any): Arbitrary keyword arguments.
|
|
34
|
+
The implementing class is responsible for defining the arguments.
|
|
35
|
+
|
|
36
|
+
Returns:
|
|
37
|
+
None
|
|
38
|
+
"""
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from typing import Awaitable, TypeVar
|
|
2
|
+
|
|
3
|
+
T = TypeVar('T')
|
|
4
|
+
|
|
5
|
+
def run_async_in_sync(coro: Awaitable[T]) -> T:
|
|
6
|
+
'''Run an async coroutine from synchronous code safely.
|
|
7
|
+
|
|
8
|
+
This function handles the common scenario where you need to call an async function
|
|
9
|
+
from synchronous code, but you\'re not sure if there\'s already an event loop running.
|
|
10
|
+
|
|
11
|
+
Args:
|
|
12
|
+
coro (Awaitable[T]): The coroutine to run.
|
|
13
|
+
|
|
14
|
+
Returns:
|
|
15
|
+
T: The result of the coroutine.
|
|
16
|
+
|
|
17
|
+
Example:
|
|
18
|
+
>>> async def fetch_data():
|
|
19
|
+
... return "data"
|
|
20
|
+
>>> result = run_async_in_sync(fetch_data())
|
|
21
|
+
>>> print(result) # "data"
|
|
22
|
+
'''
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
def create_folder(folder_path: str) -> None:
|
|
4
|
+
"""Create a folder.
|
|
5
|
+
|
|
6
|
+
This function checks if the folder path exists. If the folder path does not
|
|
7
|
+
exist, the function creates a folder in the specified folder path.
|
|
8
|
+
|
|
9
|
+
Args:
|
|
10
|
+
folder_path (str): The folder path to create.
|
|
11
|
+
"""
|
|
12
|
+
def create_full_path(dir_path: str, filename: str, file_extension: str) -> str:
|
|
13
|
+
"""Create a full path for a file.
|
|
14
|
+
|
|
15
|
+
This function creates a full path for a file by combining the directory
|
|
16
|
+
path, the filename, and the file extension.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
dir_path (str): The directory path.
|
|
20
|
+
filename (str): The filename.
|
|
21
|
+
file_extension (str): The file extension.
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
str: The full path for the file.
|
|
25
|
+
"""
|
|
26
|
+
def save_to_json(elements: list[dict[str, Any]] | dict[str, Any], folder_path: str, file_name: str) -> str:
|
|
27
|
+
"""Save a list of elements to a JSON file.
|
|
28
|
+
|
|
29
|
+
This function saves a list of elements to a JSON file. The function takes
|
|
30
|
+
the list of elements, the folder path, and the file name as input and saves
|
|
31
|
+
the elements to a JSON file in the specified folder.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
elements (list[dict[str, Any]] | dict[str, Any]): The list of elements to save.
|
|
35
|
+
folder_path (str): The folder path to save the JSON file.
|
|
36
|
+
file_name (str): The file name of the JSON file.
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
str: The full filepath of the created JSON file.
|
|
40
|
+
"""
|
|
41
|
+
def save_to_csv(elements: list[dict[str, Any]], folder_path: str, file_name: str) -> None:
|
|
42
|
+
"""Save a list of elements to a CSV file.
|
|
43
|
+
|
|
44
|
+
This function saves a list of elements to a CSV file. The function takes
|
|
45
|
+
the list of elements, the folder path, and the file name as input and saves
|
|
46
|
+
the elements to a CSV file in the specified folder.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
elements (list[dict[str, Any]]): The list of elements to save.
|
|
50
|
+
folder_path (str): The folder path to save the CSV file.
|
|
51
|
+
file_name (str): The file name of the CSV file.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
None
|
|
55
|
+
"""
|
|
56
|
+
def save_file(content: str, filename: str):
|
|
57
|
+
"""Save the content to a file.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
content (str): The content to save.
|
|
61
|
+
filename (str): The filename to save the content to.
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
None
|
|
65
|
+
"""
|
|
66
|
+
def read_json_file(file_path: str) -> list[dict[str, Any]] | dict[str, Any]:
|
|
67
|
+
"""Read a JSON file.
|
|
68
|
+
|
|
69
|
+
This function reads a JSON file and returns the content of the JSON file.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
file_path (str): The path of the JSON file to read.
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
list[dict[str, Any]] | dict[str, Any]: The content of the JSON file.
|
|
76
|
+
"""
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.model.element import AUDIO as AUDIO, FOOTER as FOOTER, HEADER as HEADER, HEADING as HEADING, IMAGE as IMAGE, TABLE as TABLE, TITLE as TITLE, VIDEO as VIDEO
|
|
3
|
+
|
|
4
|
+
FORMATTING_TAGS: Incomplete
|
|
5
|
+
SPACING: str
|
|
6
|
+
|
|
7
|
+
class MetaDataKeys:
|
|
8
|
+
"""Represents keys commonly used in metadata for web content."""
|
|
9
|
+
CHARSET: str
|
|
10
|
+
PROPERTY: str
|
|
11
|
+
CONTENT: str
|
|
12
|
+
NAME: str
|
|
13
|
+
HTTP_EQUIV: str
|
|
14
|
+
URL: str
|
|
15
|
+
TITLE: str
|
|
16
|
+
METADATA: str
|
|
17
|
+
SOURCE: str
|
|
18
|
+
SOURCE_TYPE: str
|
|
19
|
+
LOADED_DATETIME: str
|
|
20
|
+
|
|
21
|
+
class ContentDataKeys:
|
|
22
|
+
"""Represents keys commonly used in web content data."""
|
|
23
|
+
TAG: str
|
|
24
|
+
CONTENT: str
|
|
25
|
+
SOURCE: str
|
|
26
|
+
TYPE: str
|
|
27
|
+
SRC: str
|
|
28
|
+
PLACEHOLDER: str
|
|
29
|
+
TABLE: str
|
|
30
|
+
HREF: str
|
|
31
|
+
ALT: str
|
|
32
|
+
CLASS: str
|
|
33
|
+
VALUE: str
|
|
34
|
+
|
|
35
|
+
class ItemDataKeys:
|
|
36
|
+
"""Represents keys used for handling item data."""
|
|
37
|
+
ELEMENTS: str
|
|
38
|
+
TEXT: str
|
|
39
|
+
STRUCTURE: str
|
|
40
|
+
ELEMENT_ID: str
|
|
41
|
+
INDEX: str
|
|
42
|
+
LINK: str
|
|
43
|
+
FORMATS: str
|
|
44
|
+
COMBINE_PREV: str
|
|
45
|
+
LIST_TYPE: str
|
|
46
|
+
IS_LIST_FIRST_ITEM: str
|
|
47
|
+
METADATA: str
|
|
48
|
+
URL: str
|
|
49
|
+
GROUP_ID: str
|
|
50
|
+
PARENT_ID: str
|
|
51
|
+
LINE_BREAK: str
|
|
52
|
+
HTML_TAGS: str
|
|
53
|
+
ROW_ITEM: str
|
|
54
|
+
COLSPAN: str
|
|
55
|
+
ROWSPAN: str
|
|
56
|
+
|
|
57
|
+
class HTMLTags:
|
|
58
|
+
"""Represents commonly used HTML tags as constants."""
|
|
59
|
+
IMG: str
|
|
60
|
+
INPUT: str
|
|
61
|
+
SVG: str
|
|
62
|
+
SOURCE: str
|
|
63
|
+
TABLE: str
|
|
64
|
+
A: str
|
|
65
|
+
VIDEO: str
|
|
66
|
+
AUDIO: str
|
|
67
|
+
IFRAME: str
|
|
68
|
+
EMBED: str
|
|
69
|
+
TEXT: str
|
|
70
|
+
UL: str
|
|
71
|
+
OL: str
|
|
72
|
+
LI: str
|
|
73
|
+
P: str
|
|
74
|
+
BR: str
|
|
75
|
+
H: Incomplete
|
|
76
|
+
HEADER: str
|
|
77
|
+
TITLE: str
|
|
78
|
+
FOOTER: str
|
|
79
|
+
MEDIA_TAGS: Incomplete
|
|
80
|
+
TR: str
|
|
81
|
+
TD: str
|
|
82
|
+
TH: str
|
|
83
|
+
TBODY: str
|
|
84
|
+
TFOOT: str
|
|
85
|
+
THEAD: str
|
|
86
|
+
IMAGE_TAGS: Incomplete
|
|
87
|
+
|
|
88
|
+
class ErrorMessage:
|
|
89
|
+
"""Represents predefined error messages used in the application."""
|
|
90
|
+
ERROR_FAILED_SAVE_JSON: str
|
|
91
|
+
ERROR_FAILED_SAVE_CSV: str
|
|
92
|
+
ERROR_FAILED_EXTRACT_DATA: str
|
|
93
|
+
ERROR_MISSING_KEY: str
|
|
94
|
+
ERROR_FAILED_TO_PROCESS_ITEM: str
|
|
95
|
+
ERROR_FAILED_TO_OPEN_SPIDER: str
|
|
96
|
+
ERROR_UNKNOWN_SOURCE: str
|
|
97
|
+
|
|
98
|
+
class Structure:
|
|
99
|
+
"""Represents the structure of the content."""
|
|
100
|
+
@classmethod
|
|
101
|
+
def get_structure(cls, tag: str):
|
|
102
|
+
"""Get the structure associated with the given HTML tag.
|
|
103
|
+
|
|
104
|
+
This class method maps HTML tags to their corresponding structure types and returns the
|
|
105
|
+
structure associated with the provided HTML tag.
|
|
106
|
+
|
|
107
|
+
Args:
|
|
108
|
+
tag (str): The HTML tag for which to retrieve the structure.
|
|
109
|
+
|
|
110
|
+
Returns:
|
|
111
|
+
str or None: The structure associated with the HTML tag, or None if the tag is not mapped.
|
|
112
|
+
"""
|
|
113
|
+
|
|
114
|
+
class TableConstants:
|
|
115
|
+
"""Represents constants used for table extraction."""
|
|
116
|
+
TABLE_META_KEY: str
|
|
117
|
+
TABLE_CONTENT_KEY: str
|
|
118
|
+
TABLE_ROW_TYPE_KEY: str
|
|
119
|
+
MAX_CHAR_COUNT_PER_COLUMN: str
|
|
120
|
+
HEADER: str
|
|
121
|
+
BODY: str
|
|
122
|
+
FOOTER: str
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from gllm_docproc.validator.character_count_validator import CharacterCountValidator as CharacterCountValidator
|
|
2
|
+
from gllm_docproc.validator.file_size_validator import FileSizeValidator as FileSizeValidator
|
|
3
|
+
from gllm_docproc.validator.page_count_validator import PageCountValidator as PageCountValidator
|
|
4
|
+
from gllm_docproc.validator.pipeline_validator import PipelineValidator as PipelineValidator
|
|
5
|
+
|
|
6
|
+
__all__ = ['PipelineValidator', 'CharacterCountValidator', 'FileSizeValidator', 'PageCountValidator']
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
|
|
4
|
+
from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
|
|
5
|
+
|
|
6
|
+
class BaseValidator(ABC):
|
|
7
|
+
"""Abstract base class for file validators.
|
|
8
|
+
|
|
9
|
+
This class defines the interface that all file validators must implement.
|
|
10
|
+
Each validator should validate a specific aspect of a file and return
|
|
11
|
+
a ValidatorResult indicating success/failure and an appropriate message.
|
|
12
|
+
"""
|
|
13
|
+
stop_on_failure: Incomplete
|
|
14
|
+
applicable_extensions: Incomplete
|
|
15
|
+
logger: Incomplete
|
|
16
|
+
def __init__(self, stop_on_failure: bool = False, applicable_extensions: list[str] | None = None) -> None:
|
|
17
|
+
"""Initialize the BaseValidator.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
stop_on_failure (bool, optional): Whether to terminate the validation process if this validator fails.
|
|
21
|
+
Default is False.
|
|
22
|
+
applicable_extensions (list[str] | None, optional): The list of file extensions that this validator is
|
|
23
|
+
applicable to. Default is None which means all extensions are applicable.
|
|
24
|
+
"""
|
|
25
|
+
def validate(self, file_validation_input: ValidatorInput) -> ValidatorResult:
|
|
26
|
+
"""Validate the file against the validator's criteria.
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
file_validation_input (ValidatorInput): The ValidatorInput object to validate.
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
ValidatorResult: A ValidatorResult object indicating success or failure
|
|
33
|
+
with an appropriate message.
|
|
34
|
+
"""
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.loader.csv.pandas_loader import CSV_VARIANTS as CSV_VARIANTS
|
|
3
|
+
from gllm_docproc.validator.base_validator import BaseValidator as BaseValidator
|
|
4
|
+
from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
|
|
5
|
+
from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
|
|
6
|
+
|
|
7
|
+
class CharacterCountValidator(BaseValidator):
|
|
8
|
+
"""Validator for checking if the total character length of file content does not exceed a maximum limit.
|
|
9
|
+
|
|
10
|
+
Character length counting is currently supported for:
|
|
11
|
+
- CSV files (csv, tsv, psv, ssv)
|
|
12
|
+
- TXT files
|
|
13
|
+
"""
|
|
14
|
+
CHUNK_SIZE_BYTES: Incomplete
|
|
15
|
+
max_character_length: Incomplete
|
|
16
|
+
def __init__(self, max_character_length: int = 500000, stop_on_failure: bool = False, applicable_extensions: list[str] | None = None) -> None:
|
|
17
|
+
"""Initialize the CharacterCountValidator.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
max_character_length (int, optional): The maximum allowed character length for the file.
|
|
21
|
+
Default is 500,000 characters. It should be greater than 0.
|
|
22
|
+
stop_on_failure (bool, optional): Whether to stop the validation process if this
|
|
23
|
+
validator fails. Default is False.
|
|
24
|
+
applicable_extensions (list[str] | None, optional): The list of file extensions that this validator
|
|
25
|
+
is applicable to. Default is None which means all extensions are applicable.
|
|
26
|
+
"""
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.validator.base_validator import BaseValidator as BaseValidator
|
|
3
|
+
from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
|
|
4
|
+
from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
|
|
5
|
+
|
|
6
|
+
class FileSizeValidator(BaseValidator):
|
|
7
|
+
"""Validator for checking if file size does not exceed a maximum limit."""
|
|
8
|
+
max_file_size: Incomplete
|
|
9
|
+
def __init__(self, max_file_size: int = 10485760, stop_on_failure: bool = True, applicable_extensions: list[str] | None = None) -> None:
|
|
10
|
+
"""Initialize the FileSizeValidator.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
max_file_size (int, optional): The maximum allowed size for the file in bytes.
|
|
14
|
+
Default is 10,485,760 bytes (10MB). It should be greater than 0.
|
|
15
|
+
stop_on_failure (bool, optional): Whether to stop the validation process if this
|
|
16
|
+
validator fails. Default is True.
|
|
17
|
+
applicable_extensions (list[str] | None, optional): The list of file extensions
|
|
18
|
+
that this validator is applicable to. Default is None which means
|
|
19
|
+
all extensions are applicable.
|
|
20
|
+
"""
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from pydantic import BaseModel
|
|
3
|
+
from types import TracebackType
|
|
4
|
+
from typing import BinaryIO
|
|
5
|
+
|
|
6
|
+
class ValidatorInput(BaseModel):
|
|
7
|
+
"""File object used for validation.
|
|
8
|
+
|
|
9
|
+
Attributes:
|
|
10
|
+
name (str): File name (basename).
|
|
11
|
+
extension (str): File extension without leading dot, lowercased (e.g., 'pdf').
|
|
12
|
+
size (int): File size in bytes.
|
|
13
|
+
file (BinaryIO): Open binary file handle for content-based validations.
|
|
14
|
+
content_type (str | None): Optional content type (MIME), if known.
|
|
15
|
+
"""
|
|
16
|
+
model_config: Incomplete
|
|
17
|
+
name: str
|
|
18
|
+
extension: str
|
|
19
|
+
size: int
|
|
20
|
+
file: BinaryIO
|
|
21
|
+
content_type: str | None
|
|
22
|
+
@classmethod
|
|
23
|
+
def from_path(cls, path: str) -> ValidatorInput:
|
|
24
|
+
"""Create a ValidatorInput from a local path (opens in rb mode).
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
path (str): The file path to create ValidatorInput from.
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
ValidatorInput: A ValidatorInput instance.
|
|
31
|
+
"""
|
|
32
|
+
def close(self) -> None:
|
|
33
|
+
"""Close the underlying file handle if owned by this object.
|
|
34
|
+
|
|
35
|
+
This method is idempotent and will not raise an error if the file is already closed.
|
|
36
|
+
"""
|
|
37
|
+
def __enter__(self) -> ValidatorInput:
|
|
38
|
+
"""Enter the runtime context related to this object.
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
ValidatorInput: The ValidatorInput instance itself.
|
|
42
|
+
"""
|
|
43
|
+
def __exit__(self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None) -> None:
|
|
44
|
+
"""Exit the runtime context and close the file handle if owned.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
exc_type (type[BaseException] | None): The exception type, if any.
|
|
48
|
+
exc_val (BaseException | None): The exception value, if any.
|
|
49
|
+
exc_tb (TracebackType | None): The traceback, if any.
|
|
50
|
+
"""
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
class ValidatorResult(BaseModel):
|
|
5
|
+
"""Represents the result of a validation operation.
|
|
6
|
+
|
|
7
|
+
This class encapsulates the result of a validation operation, including whether
|
|
8
|
+
the validation passed or failed and any associated message.
|
|
9
|
+
|
|
10
|
+
Attributes:
|
|
11
|
+
is_valid (bool): Whether the validation passed or failed.
|
|
12
|
+
source_validator (str): Validator class name that produced this result.
|
|
13
|
+
message (str): The message associated with the validation result.
|
|
14
|
+
params (dict[str, Any]): The parameters associated with the validation result.
|
|
15
|
+
"""
|
|
16
|
+
is_valid: bool
|
|
17
|
+
source_validator: str
|
|
18
|
+
message: str
|
|
19
|
+
params: dict[str, Any]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.validator.base_validator import BaseValidator as BaseValidator
|
|
3
|
+
from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
|
|
4
|
+
from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
|
|
5
|
+
|
|
6
|
+
class PageCountValidator(BaseValidator):
|
|
7
|
+
"""Validator for checking if the number of pages in a file does not exceed a maximum limit.
|
|
8
|
+
|
|
9
|
+
Page counting is currently supported for PDF files only.
|
|
10
|
+
"""
|
|
11
|
+
max_pages: Incomplete
|
|
12
|
+
def __init__(self, max_pages: int = 100, stop_on_failure: bool = False, applicable_extensions: list[str] | None = None) -> None:
|
|
13
|
+
"""Initialize the PageCountValidator.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
max_pages (int, optional): The maximum allowed number of pages. A non-negative
|
|
17
|
+
value enforces a limit. Default is 100 pages. It should be greater than 0.
|
|
18
|
+
stop_on_failure (bool, optional): Whether to stop the validation process if this
|
|
19
|
+
validator fails. Default is False.
|
|
20
|
+
applicable_extensions (list[str] | None, optional): The list of file extensions
|
|
21
|
+
that this validator is applicable to. Default is None which means
|
|
22
|
+
all extensions are applicable.
|
|
23
|
+
"""
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from gllm_docproc.validator.base_validator import BaseValidator as BaseValidator
|
|
2
|
+
from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
|
|
3
|
+
from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
|
|
4
|
+
|
|
5
|
+
class PipelineValidator:
|
|
6
|
+
"""A pipeline for validating files against multiple validation rules.
|
|
7
|
+
|
|
8
|
+
This class provides a flexible way to validate files by chaining multiple `BaseValidator`
|
|
9
|
+
instances. Each validator is applied sequentially, and validation behavior depends on
|
|
10
|
+
the `stop_on_failure` setting of each validator.
|
|
11
|
+
|
|
12
|
+
Attributes:
|
|
13
|
+
validators (list[BaseValidator]): A list of `BaseValidator` instances to apply for file validation.
|
|
14
|
+
"""
|
|
15
|
+
validators: list[BaseValidator]
|
|
16
|
+
def __init__(self) -> None:
|
|
17
|
+
"""Initialize the PipelineValidator object."""
|
|
18
|
+
def add_validator(self, validator: BaseValidator) -> PipelineValidator:
|
|
19
|
+
"""Add a validator to the validation pipeline.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
validator (BaseValidator): The validator to add to the pipeline.
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
PipelineValidator: The validation pipeline object for method chaining.
|
|
26
|
+
"""
|
|
27
|
+
def validate(self, file_validation_input: ValidatorInput) -> list[ValidatorResult]:
|
|
28
|
+
"""Validate the file against all configured validation rules.
|
|
29
|
+
|
|
30
|
+
Validation stops early if a validator fails and its `stop_on_failure`
|
|
31
|
+
setting is True; in that case, the returned list will only include results
|
|
32
|
+
up to and including the failing validator.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
file_validation_input (ValidatorInput): The file validation input object to validate.
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
list[ValidatorResult]: A list of ValidatorResult objects for each validator run,
|
|
39
|
+
which may be truncated if a validator with `stop_on_failure=True` fails.
|
|
40
|
+
"""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
*
|
|
Binary file
|