gllm_docproc_binary-0.7.26-cp311-cp311-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gllm-docproc-binary might be problematic.

Files changed (168)
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +28 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +15 -0
  11. gllm_docproc/data_generator/__init__.pyi +5 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +18 -0
  13. gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
  14. gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
  15. gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
  16. gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
  17. gllm_docproc/downloader/__init__.pyi +5 -0
  18. gllm_docproc/downloader/base_downloader.pyi +19 -0
  19. gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
  20. gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
  21. gllm_docproc/downloader/html/__init__.pyi +7 -0
  22. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  23. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  24. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  25. gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
  26. gllm_docproc/downloader/html/html_downloader.pyi +114 -0
  27. gllm_docproc/downloader/html/playwright_downloader.pyi +60 -0
  28. gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
  29. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  30. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  31. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  32. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  33. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
  34. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  35. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
  36. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  37. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  38. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  39. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
  40. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  41. gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
  42. gllm_docproc/dpo_router/__init__.pyi +5 -0
  43. gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
  44. gllm_docproc/dpo_router/loader_router.pyi +52 -0
  45. gllm_docproc/dpo_router/parser_router.pyi +42 -0
  46. gllm_docproc/housekeeping/__init__.pyi +3 -0
  47. gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
  48. gllm_docproc/indexer/__init__.pyi +3 -0
  49. gllm_docproc/indexer/base_indexer.pyi +30 -0
  50. gllm_docproc/indexer/graph/__init__.pyi +4 -0
  51. gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
  52. gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
  53. gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
  54. gllm_docproc/indexer/vector/__init__.pyi +3 -0
  55. gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
  56. gllm_docproc/loader/__init__.pyi +4 -0
  57. gllm_docproc/loader/audio/__init__.pyi +3 -0
  58. gllm_docproc/loader/audio/audio_loader.pyi +45 -0
  59. gllm_docproc/loader/base_loader.pyi +30 -0
  60. gllm_docproc/loader/csv/__init__.pyi +3 -0
  61. gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
  62. gllm_docproc/loader/docx/__init__.pyi +5 -0
  63. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  64. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  65. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  66. gllm_docproc/loader/exception/__init__.pyi +4 -0
  67. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  68. gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
  69. gllm_docproc/loader/html/__init__.pyi +5 -0
  70. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  71. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  72. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  73. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +66 -0
  74. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  75. gllm_docproc/loader/html/flat/html_flat_merger.pyi +23 -0
  76. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  77. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  78. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  79. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  80. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  81. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  82. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  83. gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
  84. gllm_docproc/loader/html/utils/html_utils.pyi +59 -0
  85. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  86. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  87. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  88. gllm_docproc/loader/image/__init__.pyi +3 -0
  89. gllm_docproc/loader/image/image_loader.pyi +54 -0
  90. gllm_docproc/loader/json/__init__.pyi +3 -0
  91. gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
  92. gllm_docproc/loader/loader_utils.pyi +43 -0
  93. gllm_docproc/loader/pdf/__init__.pyi +14 -0
  94. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
  95. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
  96. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
  97. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  98. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  99. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  100. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  101. gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
  102. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
  103. gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
  104. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
  105. gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
  106. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  107. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  108. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  109. gllm_docproc/loader/pptx/__init__.pyi +3 -0
  110. gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
  111. gllm_docproc/loader/txt/__init__.pyi +3 -0
  112. gllm_docproc/loader/txt/txt_loader.pyi +55 -0
  113. gllm_docproc/loader/video/__init__.pyi +3 -0
  114. gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
  115. gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
  116. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  117. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
  118. gllm_docproc/model/__init__.pyi +7 -0
  119. gllm_docproc/model/element.pyi +38 -0
  120. gllm_docproc/model/element_metadata.pyi +35 -0
  121. gllm_docproc/model/loader_type.pyi +20 -0
  122. gllm_docproc/model/media.pyi +51 -0
  123. gllm_docproc/model/parser_type.pyi +19 -0
  124. gllm_docproc/parser/__init__.pyi +4 -0
  125. gllm_docproc/parser/base_parser.pyi +28 -0
  126. gllm_docproc/parser/document/__init__.pyi +7 -0
  127. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  128. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  129. gllm_docproc/parser/document/pptx_parser.pyi +34 -0
  130. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  131. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  132. gllm_docproc/parser/html/__init__.pyi +4 -0
  133. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  134. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  135. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  136. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  137. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  138. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  139. gllm_docproc/parser/image/__init__.pyi +4 -0
  140. gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
  141. gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
  142. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  143. gllm_docproc/parser/table/__init__.pyi +3 -0
  144. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  145. gllm_docproc/request_handler/__init__.pyi +3 -0
  146. gllm_docproc/request_handler/base_request_handler.pyi +16 -0
  147. gllm_docproc/response_handler/__init__.pyi +3 -0
  148. gllm_docproc/response_handler/base_response_handler.pyi +38 -0
  149. gllm_docproc/utils/__init__.pyi +3 -0
  150. gllm_docproc/utils/async_utils.pyi +22 -0
  151. gllm_docproc/utils/file_utils.pyi +76 -0
  152. gllm_docproc/utils/html_constants.pyi +122 -0
  153. gllm_docproc/validator/__init__.pyi +6 -0
  154. gllm_docproc/validator/base_validator.pyi +34 -0
  155. gllm_docproc/validator/character_count_validator.pyi +26 -0
  156. gllm_docproc/validator/file_size_validator.pyi +20 -0
  157. gllm_docproc/validator/model/__init__.pyi +4 -0
  158. gllm_docproc/validator/model/validator_input.pyi +50 -0
  159. gllm_docproc/validator/model/validator_result.pyi +19 -0
  160. gllm_docproc/validator/page_count_validator.pyi +23 -0
  161. gllm_docproc/validator/pipeline_validator.pyi +40 -0
  162. gllm_docproc.build/.gitignore +1 -0
  163. gllm_docproc.cpython-311-darwin.so +0 -0
  164. gllm_docproc.pyi +222 -0
  165. gllm_docproc_binary-0.7.26.dist-info/METADATA +216 -0
  166. gllm_docproc_binary-0.7.26.dist-info/RECORD +168 -0
  167. gllm_docproc_binary-0.7.26.dist-info/WHEEL +5 -0
  168. gllm_docproc_binary-0.7.26.dist-info/top_level.txt +1 -0

gllm_docproc/parser/image/image_plain_small_filter_parser.pyi
@@ -0,0 +1,45 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.model.element import Element as Element, IMAGE as IMAGE
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from typing import Any
+
+ logger: Incomplete
+
+ class ImagePlainSmallFilterParser(BaseParser):
+     """ImagePlainSmallFilterParser class.
+
+     A class to filter image elements from a document based on size requirements
+     and meaningful content analysis. This parser focuses on filtering operations
+     and does not perform coordinate-based transformations.
+
+     The parser filters images by:
+     1. Checking minimum dimension requirements
+     2. Validating meaningful content using contrast analysis
+
+     Methods:
+         parse(loaded_elements, **kwargs): Filter image elements in the document.
+     """
+     min_width: Incomplete
+     min_height: Incomplete
+     contrast_threshold: Incomplete
+     def __init__(self, min_width: int = 4, min_height: int = 4, contrast_threshold: float = 0.05) -> None:
+         """Initialize the ImagePlainSmallFilterParser.
+
+         Args:
+             min_width (int, optional): Minimum required width. Defaults to 4.
+             min_height (int, optional): Minimum required height. Defaults to 4.
+             contrast_threshold (float, optional): Contrast quality threshold. Defaults to 0.05.
+         """
+     def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Filter image elements in the document.
+
+         This method processes image elements by validating dimensions and content quality.
+         Images that don't meet the criteria are removed from the document.
+
+         Args:
+             loaded_elements (list[dict[str, Any]]): The elements to process.
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             list[dict[str, Any]]: The filtered elements with valid images only.
+         """
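
For orientation, a minimal usage sketch based only on the stub above; the element dictionaries and their keys are illustrative assumptions, since the real element schema lives in the compiled gllm_docproc.model.element module.

    from gllm_docproc.parser.image.image_plain_small_filter_parser import ImagePlainSmallFilterParser

    # Hypothetical loaded elements; the actual keys come from gllm_docproc.model.element.
    loaded_elements = [
        {"structure": "image", "content": "<base64 payload>", "metadata": {"width": 2, "height": 2}},
        {"structure": "paragraph", "content": "Some text", "metadata": {}},
    ]

    # Drop images smaller than 8x8 pixels or without meaningful contrast.
    parser = ImagePlainSmallFilterParser(min_width=8, min_height=8, contrast_threshold=0.05)
    filtered = parser.parse(loaded_elements)  # returns list[dict[str, Any]]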

gllm_docproc/parser/pipeline_parser.pyi
@@ -0,0 +1,33 @@
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from typing import Any
+
+ class PipelineParser:
+     """Pipeline parser for parsing documents.
+
+     This class serves as the pipeline parser for parsing documents. It defines the structure for
+     parsing documents with several parsers chained in a pipeline.
+
+     Methods:
+         add_parser(parser): Add a parser to the pipeline parser.
+         parse(elements, **kwargs): Parse the elements using the registered parsers.
+     """
+     parsers: list[BaseParser]
+     def __init__(self) -> None:
+         """Initialize the pipeline parser."""
+     def add_parser(self, parser: BaseParser):
+         """Add a parser to the pipeline parser.
+
+         This method defines the process of adding a parser to the pipeline parser.
+
+         Args:
+             parser (BaseParser): The parser to be added.
+         """
+     def parse(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Parse the elements using the pipeline parser.
+
+         This method defines the process of parsing the elements with the registered parsers in order.
+
+         Args:
+             elements (list[dict[str, Any]]): A list of dictionaries containing elements.
+             **kwargs (Any): Additional keyword arguments.
+         """
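
A short composition sketch, assuming the compiled package behaves as these stubs document; the two pipeline stages are simply parsers that appear elsewhere in this diff, and loaded_elements stands in for the output of a loader.

    from gllm_docproc.parser.pipeline_parser import PipelineParser
    from gllm_docproc.parser.image.image_plain_small_filter_parser import ImagePlainSmallFilterParser
    from gllm_docproc.parser.table.table_caption_parser import TableCaptionParser

    loaded_elements: list[dict] = []  # in practice, the output of a loader (e.g. PipelineLoader)

    # Each parser in the pipeline receives the output of the previous one.
    pipeline = PipelineParser()
    pipeline.add_parser(ImagePlainSmallFilterParser())
    pipeline.add_parser(TableCaptionParser())

    parsed_elements = pipeline.parse(loaded_elements)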

gllm_docproc/parser/table/__init__.pyi
@@ -0,0 +1,3 @@
+ from .table_caption_parser import TableCaptionParser as TableCaptionParser
+
+ __all__ = ['TableCaptionParser']

gllm_docproc/parser/table/table_caption_parser.pyi
@@ -0,0 +1,66 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.model.element import Element as Element, HEADING as HEADING, PARAGRAPH as PARAGRAPH, TABLE as TABLE, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from typing import Any
+
+ TABLE_AND_CAPTION_STRUCTURE: Incomplete
+ UPPER_ELEMENT_IS_CAPTION: str
+ LOWER_ELEMENT_IS_CAPTION: str
+ MAX_CAPTION_LENGTH: str
+ REMOVE_CAPTION_FROM_ELEMENT: str
+ MAX_CAPTION_ELEMENTS: str
+ UPPER_CAPTION_EXTRACTOR: str
+ LOWER_CAPTION_EXTRACTOR: str
+
+ def curry_upper_caption_extractor(remove_caption_from_element: bool):
+     """Curry Upper Caption Extractor.
+
+     This function curries the extract_upper_caption function with the remove_caption_from_element parameter.
+
+     Why use currying?
+     1. So the user can customize the upper_caption_extractor function.
+     2. The customized upper_caption_extractor may not require the remove_caption_from_element parameter.
+
+     Args:
+         remove_caption_from_element (bool): A boolean value to remove the caption from the element.
+
+     Returns:
+         function: The function to extract the upper caption.
+     """
+ def curry_lower_caption_extractor(remove_caption_from_element: bool):
+     """Curry Lower Caption Extractor.
+
+     This function curries the extract_lower_caption function with the remove_caption_from_element parameter.
+
+     Why use currying?
+     1. So the user can customize the lower_caption_extractor function.
+     2. The customized lower_caption_extractor may not require the remove_caption_from_element parameter.
+
+     Args:
+         remove_caption_from_element (bool): A boolean value to remove the caption from the element.
+
+     Returns:
+         function: The function to extract the lower caption.
+     """
+
+ class TableCaptionParser(BaseParser):
+     """TableCaptionParser class.
+
+     A class to extract table captions from a document and add them to the metadata of the table element.
+
+     Methods:
+         parse(loaded_elements, **kwargs): Extract table captions from a document and add them to
+             the metadata of the table element.
+     """
+     def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Parses the elements to extract table captions.
+
+         This method extracts table captions from the elements and adds them to the metadata of the table element.
+
+         Args:
+             loaded_elements (list[dict[str, Any]]): The elements to extract table captions from.
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             list[dict[str, Any]]: The elements with the table captions added to the metadata.
+         """
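
A minimal sketch of the caption parser in isolation; the option keys above are string constants whose concrete values are not shown in the stub, so this example sticks to the defaults rather than guessing keyword names.

    from gllm_docproc.parser.table.table_caption_parser import TableCaptionParser

    loaded_elements: list[dict] = []  # table and surrounding text elements produced by a loader

    # Attach captions found in neighbouring text elements to each table's metadata.
    caption_parser = TableCaptionParser()
    elements_with_captions = caption_parser.parse(loaded_elements)

    # Caption direction and length limits appear to be tunable through **kwargs keyed by the
    # module constants above (e.g. UPPER_ELEMENT_IS_CAPTION, MAX_CAPTION_LENGTH).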

gllm_docproc/request_handler/__init__.pyi
@@ -0,0 +1,3 @@
+ from .base_request_handler import BaseRequestHandler as BaseRequestHandler
+
+ __all__ = ['BaseRequestHandler']

gllm_docproc/request_handler/base_request_handler.pyi
@@ -0,0 +1,16 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseRequestHandler(ABC):
+     """Base class for request handlers."""
+     @abstractmethod
+     def handle_request(self, **kwargs: Any) -> None:
+         """Handles a request.
+
+         Args:
+             **kwargs (Any): Arbitrary keyword arguments.
+                 The implementing class is responsible for defining the arguments.
+
+         Returns:
+             None
+         """
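
A toy subclass sketch against the abstract interface above; the logging body and the keyword arguments are illustrative assumptions, since each implementation defines its own argument contract.

    from typing import Any

    from gllm_docproc.request_handler.base_request_handler import BaseRequestHandler

    class PrintRequestHandler(BaseRequestHandler):
        """Illustrative handler that only logs the incoming request kwargs."""

        def handle_request(self, **kwargs: Any) -> None:
            # A real handler might enqueue a document-processing job here.
            print(f"received request: {kwargs}")

    PrintRequestHandler().handle_request(source="https://example.com/doc.pdf", priority="high")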

gllm_docproc/response_handler/__init__.pyi
@@ -0,0 +1,3 @@
+ from .base_response_handler import BaseResponseHandler as BaseResponseHandler
+
+ __all__ = ['BaseResponseHandler']

gllm_docproc/response_handler/base_response_handler.pyi
@@ -0,0 +1,38 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseResponseHandler(ABC):
+     """Base class for response handlers."""
+     @abstractmethod
+     def handle_success_response(self, **kwargs: Any) -> None:
+         """Handles a success response (successfully indexed).
+
+         Args:
+             **kwargs (Any): Arbitrary keyword arguments.
+                 The implementing class is responsible for defining the arguments.
+
+         Returns:
+             None
+         """
+     @abstractmethod
+     def handle_deleted_response(self, **kwargs: Any) -> None:
+         """Handles a deleted response (successfully deleted).
+
+         Args:
+             **kwargs (Any): Arbitrary keyword arguments.
+                 The implementing class is responsible for defining the arguments.
+
+         Returns:
+             None
+         """
+     @abstractmethod
+     def handle_failed_response(self, **kwargs: Any) -> None:
+         """Handles a failed response (either failed to index or failed to delete).
+
+         Args:
+             **kwargs (Any): Arbitrary keyword arguments.
+                 The implementing class is responsible for defining the arguments.
+
+         Returns:
+             None
+         """
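
Likewise, a toy response handler sketch; the print calls stand in for whatever callback or notification a real implementation would perform after indexing, deletion, or failure.

    from typing import Any

    from gllm_docproc.response_handler.base_response_handler import BaseResponseHandler

    class LoggingResponseHandler(BaseResponseHandler):
        """Illustrative handler that records indexing outcomes."""

        def handle_success_response(self, **kwargs: Any) -> None:
            print(f"indexed: {kwargs}")

        def handle_deleted_response(self, **kwargs: Any) -> None:
            print(f"deleted: {kwargs}")

        def handle_failed_response(self, **kwargs: Any) -> None:
            print(f"failed: {kwargs}")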

gllm_docproc/utils/__init__.pyi
@@ -0,0 +1,3 @@
+ from .async_utils import run_async_in_sync as run_async_in_sync
+
+ __all__ = ['run_async_in_sync']

gllm_docproc/utils/async_utils.pyi
@@ -0,0 +1,22 @@
+ from typing import Awaitable, TypeVar
+
+ T = TypeVar('T')
+
+ def run_async_in_sync(coro: Awaitable[T]) -> T:
+     '''Run an async coroutine from synchronous code safely.
+
+     This function handles the common scenario where you need to call an async function
+     from synchronous code, but you're not sure if there's already an event loop running.
+
+     Args:
+         coro (Awaitable[T]): The coroutine to run.
+
+     Returns:
+         T: The result of the coroutine.
+
+     Example:
+         >>> async def fetch_data():
+         ...     return "data"
+         >>> result = run_async_in_sync(fetch_data())
+         >>> print(result)  # "data"
+     '''
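
A small sketch of the helper in context, assuming it behaves as its docstring describes; caption_image is a made-up coroutine used only for illustration.

    import asyncio

    from gllm_docproc.utils.async_utils import run_async_in_sync

    async def caption_image(url: str) -> str:
        await asyncio.sleep(0)  # stand-in for a real async call
        return f"caption for {url}"

    # From plain synchronous code this is equivalent to asyncio.run(...); per the docstring it is
    # also meant to work when an event loop already exists (e.g. in a notebook), where a bare
    # asyncio.run(...) would raise a RuntimeError.
    print(run_async_in_sync(caption_image("img-1.png")))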

gllm_docproc/utils/file_utils.pyi
@@ -0,0 +1,76 @@
+ from typing import Any
+
+ def create_folder(folder_path: str) -> None:
+     """Create a folder.
+
+     This function checks whether the folder path exists. If the folder path does not
+     exist, the function creates a folder in the specified folder path.
+
+     Args:
+         folder_path (str): The folder path to create.
+     """
+ def create_full_path(dir_path: str, filename: str, file_extension: str) -> str:
+     """Create a full path for a file.
+
+     This function creates a full path for a file by combining the directory
+     path, the filename, and the file extension.
+
+     Args:
+         dir_path (str): The directory path.
+         filename (str): The filename.
+         file_extension (str): The file extension.
+
+     Returns:
+         str: The full path for the file.
+     """
+ def save_to_json(elements: list[dict[str, Any]] | dict[str, Any], folder_path: str, file_name: str) -> str:
+     """Save a list of elements to a JSON file.
+
+     This function saves a list of elements to a JSON file. The function takes
+     the list of elements, the folder path, and the file name as input and saves
+     the elements to a JSON file in the specified folder.
+
+     Args:
+         elements (list[dict[str, Any]] | dict[str, Any]): The list of elements to save.
+         folder_path (str): The folder path to save the JSON file.
+         file_name (str): The file name of the JSON file.
+
+     Returns:
+         str: The full filepath of the created JSON file.
+     """
+ def save_to_csv(elements: list[dict[str, Any]], folder_path: str, file_name: str) -> None:
+     """Save a list of elements to a CSV file.
+
+     This function saves a list of elements to a CSV file. The function takes
+     the list of elements, the folder path, and the file name as input and saves
+     the elements to a CSV file in the specified folder.
+
+     Args:
+         elements (list[dict[str, Any]]): The list of elements to save.
+         folder_path (str): The folder path to save the CSV file.
+         file_name (str): The file name of the CSV file.
+
+     Returns:
+         None
+     """
+ def save_file(content: str, filename: str) -> None:
+     """Save the content to a file.
+
+     Args:
+         content (str): The content to save.
+         filename (str): The filename to save the content to.
+
+     Returns:
+         None
+     """
+ def read_json_file(file_path: str) -> list[dict[str, Any]] | dict[str, Any]:
+     """Read a JSON file.
+
+     This function reads a JSON file and returns the content of the JSON file.
+
+     Args:
+         file_path (str): The path of the JSON file to read.
+
+     Returns:
+         list[dict[str, Any]] | dict[str, Any]: The content of the JSON file.
+     """
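
A brief round-trip sketch with these helpers, assuming they behave as documented; the folder and file names are illustrative.

    from gllm_docproc.utils.file_utils import create_folder, read_json_file, save_to_json

    elements = [{"structure": "paragraph", "content": "Hello"}]  # illustrative element dicts

    create_folder("./output")                                        # creates the folder if it does not exist
    json_path = save_to_json(elements, "./output", "doc_elements")   # returns the created file's full path
    round_tripped = read_json_file(json_path)                        # list or dict read back from disk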

gllm_docproc/utils/html_constants.pyi
@@ -0,0 +1,122 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.model.element import AUDIO as AUDIO, FOOTER as FOOTER, HEADER as HEADER, HEADING as HEADING, IMAGE as IMAGE, TABLE as TABLE, TITLE as TITLE, VIDEO as VIDEO
+
+ FORMATTING_TAGS: Incomplete
+ SPACING: str
+
+ class MetaDataKeys:
+     """Represents keys commonly used in metadata for web content."""
+     CHARSET: str
+     PROPERTY: str
+     CONTENT: str
+     NAME: str
+     HTTP_EQUIV: str
+     URL: str
+     TITLE: str
+     METADATA: str
+     SOURCE: str
+     SOURCE_TYPE: str
+     LOADED_DATETIME: str
+
+ class ContentDataKeys:
+     """Represents keys commonly used in web content data."""
+     TAG: str
+     CONTENT: str
+     SOURCE: str
+     TYPE: str
+     SRC: str
+     PLACEHOLDER: str
+     TABLE: str
+     HREF: str
+     ALT: str
+     CLASS: str
+     VALUE: str
+
+ class ItemDataKeys:
+     """Represents keys used for handling item data."""
+     ELEMENTS: str
+     TEXT: str
+     STRUCTURE: str
+     ELEMENT_ID: str
+     INDEX: str
+     LINK: str
+     FORMATS: str
+     COMBINE_PREV: str
+     LIST_TYPE: str
+     IS_LIST_FIRST_ITEM: str
+     METADATA: str
+     URL: str
+     GROUP_ID: str
+     PARENT_ID: str
+     LINE_BREAK: str
+     HTML_TAGS: str
+     ROW_ITEM: str
+     COLSPAN: str
+     ROWSPAN: str
+
+ class HTMLTags:
+     """Represents commonly used HTML tags as constants."""
+     IMG: str
+     INPUT: str
+     SVG: str
+     SOURCE: str
+     TABLE: str
+     A: str
+     VIDEO: str
+     AUDIO: str
+     IFRAME: str
+     EMBED: str
+     TEXT: str
+     UL: str
+     OL: str
+     LI: str
+     P: str
+     BR: str
+     H: Incomplete
+     HEADER: str
+     TITLE: str
+     FOOTER: str
+     MEDIA_TAGS: Incomplete
+     TR: str
+     TD: str
+     TH: str
+     TBODY: str
+     TFOOT: str
+     THEAD: str
+     IMAGE_TAGS: Incomplete
+
+ class ErrorMessage:
+     """Represents predefined error messages used in the application."""
+     ERROR_FAILED_SAVE_JSON: str
+     ERROR_FAILED_SAVE_CSV: str
+     ERROR_FAILED_EXTRACT_DATA: str
+     ERROR_MISSING_KEY: str
+     ERROR_FAILED_TO_PROCESS_ITEM: str
+     ERROR_FAILED_TO_OPEN_SPIDER: str
+     ERROR_UNKNOWN_SOURCE: str
+
+ class Structure:
+     """Represents the structure of the content."""
+     @classmethod
+     def get_structure(cls, tag: str):
+         """Get the structure associated with the given HTML tag.
+
+         This class method maps HTML tags to their corresponding structure types and returns the
+         structure associated with the provided HTML tag.
+
+         Args:
+             tag (str): The HTML tag for which to retrieve the structure.
+
+         Returns:
+             str or None: The structure associated with the HTML tag, or None if the tag is not mapped.
+         """
+
+ class TableConstants:
+     """Represents constants used for table extraction."""
+     TABLE_META_KEY: str
+     TABLE_CONTENT_KEY: str
+     TABLE_ROW_TYPE_KEY: str
+     MAX_CHAR_COUNT_PER_COLUMN: str
+     HEADER: str
+     BODY: str
+     FOOTER: str
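
A lookup sketch for the Structure helper, assuming it maps tag names as its docstring describes; the concrete values of the tag and structure constants live in the compiled module, so the printed results are not guaranteed here.

    from gllm_docproc.utils.html_constants import HTMLTags, Structure

    # Map raw HTML tag names to the structure types used elsewhere in the package.
    for tag in ("h1", "p", "table", "made-up-tag"):
        print(tag, "->", Structure.get_structure(tag))  # unmapped tags return None per the docstring

    print(HTMLTags.TABLE)  # string constant; its value is defined in the compiled module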

gllm_docproc/validator/__init__.pyi
@@ -0,0 +1,6 @@
+ from gllm_docproc.validator.character_count_validator import CharacterCountValidator as CharacterCountValidator
+ from gllm_docproc.validator.file_size_validator import FileSizeValidator as FileSizeValidator
+ from gllm_docproc.validator.page_count_validator import PageCountValidator as PageCountValidator
+ from gllm_docproc.validator.pipeline_validator import PipelineValidator as PipelineValidator
+
+ __all__ = ['PipelineValidator', 'CharacterCountValidator', 'FileSizeValidator', 'PageCountValidator']

gllm_docproc/validator/base_validator.pyi
@@ -0,0 +1,34 @@
+ from _typeshed import Incomplete
+ from abc import ABC
+ from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
+ from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
+
+ class BaseValidator(ABC):
+     """Abstract base class for file validators.
+
+     This class defines the interface that all file validators must implement.
+     Each validator should validate a specific aspect of a file and return
+     a ValidatorResult indicating success/failure and an appropriate message.
+     """
+     stop_on_failure: Incomplete
+     applicable_extensions: Incomplete
+     logger: Incomplete
+     def __init__(self, stop_on_failure: bool = False, applicable_extensions: list[str] | None = None) -> None:
+         """Initialize the BaseValidator.
+
+         Args:
+             stop_on_failure (bool, optional): Whether to terminate the validation process if this validator fails.
+                 Default is False.
+             applicable_extensions (list[str] | None, optional): The list of file extensions that this validator is
+                 applicable to. Default is None which means all extensions are applicable.
+         """
+     def validate(self, file_validation_input: ValidatorInput) -> ValidatorResult:
+         """Validate the file against the validator's criteria.
+
+         Args:
+             file_validation_input (ValidatorInput): The ValidatorInput object to validate.
+
+         Returns:
+             ValidatorResult: A ValidatorResult object indicating success or failure
+                 with an appropriate message.
+         """
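
A sketch of a custom validator against this base class; the ValidatorResult fields are taken from the stubs later in this diff, though the construction conventions of the compiled package may differ.

    from gllm_docproc.validator.base_validator import BaseValidator
    from gllm_docproc.validator.model.validator_input import ValidatorInput
    from gllm_docproc.validator.model.validator_result import ValidatorResult

    class NonEmptyFileValidator(BaseValidator):
        """Illustrative validator that rejects zero-byte files."""

        def validate(self, file_validation_input: ValidatorInput) -> ValidatorResult:
            is_valid = file_validation_input.size > 0
            return ValidatorResult(
                is_valid=is_valid,
                source_validator=self.__class__.__name__,
                message="ok" if is_valid else "file is empty",
                params={"size": file_validation_input.size},
            )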

gllm_docproc/validator/character_count_validator.pyi
@@ -0,0 +1,26 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.loader.csv.pandas_loader import CSV_VARIANTS as CSV_VARIANTS
+ from gllm_docproc.validator.base_validator import BaseValidator as BaseValidator
+ from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
+ from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
+
+ class CharacterCountValidator(BaseValidator):
+     """Validator for checking if the total character length of file content does not exceed a maximum limit.
+
+     Character length counting is currently supported for:
+     - CSV files (csv, tsv, psv, ssv)
+     - TXT files
+     """
+     CHUNK_SIZE_BYTES: Incomplete
+     max_character_length: Incomplete
+     def __init__(self, max_character_length: int = 500000, stop_on_failure: bool = False, applicable_extensions: list[str] | None = None) -> None:
+         """Initialize the CharacterCountValidator.
+
+         Args:
+             max_character_length (int, optional): The maximum allowed character length for the file.
+                 Default is 500,000 characters. It should be greater than 0.
+             stop_on_failure (bool, optional): Whether to stop the validation process if this
+                 validator fails. Default is False.
+             applicable_extensions (list[str] | None, optional): The list of file extensions that this validator
+                 is applicable to. Default is None which means all extensions are applicable.
+         """

gllm_docproc/validator/file_size_validator.pyi
@@ -0,0 +1,20 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.validator.base_validator import BaseValidator as BaseValidator
+ from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
+ from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
+
+ class FileSizeValidator(BaseValidator):
+     """Validator for checking if file size does not exceed a maximum limit."""
+     max_file_size: Incomplete
+     def __init__(self, max_file_size: int = 10485760, stop_on_failure: bool = True, applicable_extensions: list[str] | None = None) -> None:
+         """Initialize the FileSizeValidator.
+
+         Args:
+             max_file_size (int, optional): The maximum allowed size for the file in bytes.
+                 Default is 10,485,760 bytes (10MB). It should be greater than 0.
+             stop_on_failure (bool, optional): Whether to stop the validation process if this
+                 validator fails. Default is True.
+             applicable_extensions (list[str] | None, optional): The list of file extensions
+                 that this validator is applicable to. Default is None which means
+                 all extensions are applicable.
+         """

gllm_docproc/validator/model/__init__.pyi
@@ -0,0 +1,4 @@
+ from .validator_input import ValidatorInput as ValidatorInput
+ from .validator_result import ValidatorResult as ValidatorResult
+
+ __all__ = ['ValidatorInput', 'ValidatorResult']

gllm_docproc/validator/model/validator_input.pyi
@@ -0,0 +1,50 @@
+ from _typeshed import Incomplete
+ from pydantic import BaseModel
+ from types import TracebackType
+ from typing import BinaryIO
+
+ class ValidatorInput(BaseModel):
+     """File object used for validation.
+
+     Attributes:
+         name (str): File name (basename).
+         extension (str): File extension without leading dot, lowercased (e.g., 'pdf').
+         size (int): File size in bytes.
+         file (BinaryIO): Open binary file handle for content-based validations.
+         content_type (str | None): Optional content type (MIME), if known.
+     """
+     model_config: Incomplete
+     name: str
+     extension: str
+     size: int
+     file: BinaryIO
+     content_type: str | None
+     @classmethod
+     def from_path(cls, path: str) -> ValidatorInput:
+         """Create a ValidatorInput from a local path (opens in rb mode).
+
+         Args:
+             path (str): The file path to create ValidatorInput from.
+
+         Returns:
+             ValidatorInput: A ValidatorInput instance.
+         """
+     def close(self) -> None:
+         """Close the underlying file handle if owned by this object.
+
+         This method is idempotent and will not raise an error if the file is already closed.
+         """
+     def __enter__(self) -> ValidatorInput:
+         """Enter the runtime context related to this object.
+
+         Returns:
+             ValidatorInput: The ValidatorInput instance itself.
+         """
+     def __exit__(self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None) -> None:
+         """Exit the runtime context and close the file handle if owned.
+
+         Args:
+             exc_type (type[BaseException] | None): The exception type, if any.
+             exc_val (BaseException | None): The exception value, if any.
+             exc_tb (TracebackType | None): The traceback, if any.
+         """
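
A short sketch of the context-manager usage documented above; the path is illustrative.

    from gllm_docproc.validator.model.validator_input import ValidatorInput

    # from_path opens the file in 'rb' mode; __exit__ closes the handle this object owns.
    with ValidatorInput.from_path("./docs/report.pdf") as file_input:
        print(file_input.name, file_input.extension, file_input.size, file_input.content_type)
        header = file_input.file.read(4)  # content-based validators can read from the open handle

    file_input.close()  # close() is idempotent, so a second call after the with-block is harmless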

gllm_docproc/validator/model/validator_result.pyi
@@ -0,0 +1,19 @@
+ from pydantic import BaseModel
+ from typing import Any
+
+ class ValidatorResult(BaseModel):
+     """Represents the result of a validation operation.
+
+     This class encapsulates the result of a validation operation, including whether
+     the validation passed or failed and any associated message.
+
+     Attributes:
+         is_valid (bool): Whether the validation passed or failed.
+         source_validator (str): Validator class name that produced this result.
+         message (str): The message associated with the validation result.
+         params (dict[str, Any]): The parameters associated with the validation result.
+     """
+     is_valid: bool
+     source_validator: str
+     message: str
+     params: dict[str, Any]

gllm_docproc/validator/page_count_validator.pyi
@@ -0,0 +1,23 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.validator.base_validator import BaseValidator as BaseValidator
+ from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
+ from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
+
+ class PageCountValidator(BaseValidator):
+     """Validator for checking if the number of pages in a file does not exceed a maximum limit.
+
+     Page counting is currently supported for PDF files only.
+     """
+     max_pages: Incomplete
+     def __init__(self, max_pages: int = 100, stop_on_failure: bool = False, applicable_extensions: list[str] | None = None) -> None:
+         """Initialize the PageCountValidator.
+
+         Args:
+             max_pages (int, optional): The maximum allowed number of pages. A non-negative
+                 value enforces a limit. Default is 100 pages. It should be greater than 0.
+             stop_on_failure (bool, optional): Whether to stop the validation process if this
+                 validator fails. Default is False.
+             applicable_extensions (list[str] | None, optional): The list of file extensions
+                 that this validator is applicable to. Default is None which means
+                 all extensions are applicable.
+         """
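
Finally, a hedged sketch that runs the validators shown in this section over one input and honours stop_on_failure. PipelineValidator ships in this release as well, but its stub is not included above, so the manual loop below is a stand-in rather than its actual API.

    from gllm_docproc.validator.character_count_validator import CharacterCountValidator
    from gllm_docproc.validator.file_size_validator import FileSizeValidator
    from gllm_docproc.validator.page_count_validator import PageCountValidator
    from gllm_docproc.validator.model.validator_input import ValidatorInput

    validators = [
        FileSizeValidator(max_file_size=10 * 1024 * 1024, stop_on_failure=True),
        PageCountValidator(max_pages=100, applicable_extensions=["pdf"]),
        CharacterCountValidator(max_character_length=500_000, applicable_extensions=["csv", "txt"]),
    ]

    with ValidatorInput.from_path("./docs/report.pdf") as file_input:
        for validator in validators:
            result = validator.validate(file_input)
            print(result.source_validator, result.is_valid, result.message)
            if not result.is_valid and validator.stop_on_failure:
                break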