gllm-docproc-binary 0.1.8 (cp312-cp312-macosx_13_0_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gllm-docproc-binary might be problematic.

Files changed (123)
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +29 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +16 -0
  11. gllm_docproc/data_generator/__init__.pyi +3 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +19 -0
  13. gllm_docproc/downloader/__init__.pyi +3 -0
  14. gllm_docproc/downloader/base_downloader.pyi +16 -0
  15. gllm_docproc/downloader/html/__init__.pyi +4 -0
  16. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  17. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  18. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  19. gllm_docproc/downloader/html/html_downloader.pyi +91 -0
  20. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  21. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  22. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  23. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  24. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +29 -0
  25. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  26. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +50 -0
  27. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  28. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  29. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  30. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +45 -0
  31. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  32. gllm_docproc/downloader/html/utils/web_utils.pyi +21 -0
  33. gllm_docproc/dpo_router/__init__.pyi +3 -0
  34. gllm_docproc/dpo_router/base_dpo_router.pyi +17 -0
  35. gllm_docproc/housekeeping/__init__.pyi +3 -0
  36. gllm_docproc/housekeeping/base_housekeeping.pyi +15 -0
  37. gllm_docproc/indexer/__init__.pyi +3 -0
  38. gllm_docproc/indexer/base_indexer.pyi +31 -0
  39. gllm_docproc/indexer/graph/__init__.pyi +3 -0
  40. gllm_docproc/indexer/knowledge_graph/__init__.pyi +4 -0
  41. gllm_docproc/loader/__init__.pyi +4 -0
  42. gllm_docproc/loader/audio/__init__.pyi +3 -0
  43. gllm_docproc/loader/base_loader.pyi +31 -0
  44. gllm_docproc/loader/docx/__init__.pyi +5 -0
  45. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  46. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  47. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  48. gllm_docproc/loader/exception/__init__.pyi +3 -0
  49. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  50. gllm_docproc/loader/html/__init__.pyi +5 -0
  51. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  52. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  53. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  54. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +52 -0
  55. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  56. gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
  57. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  58. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  59. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  60. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  61. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  62. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  63. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  64. gllm_docproc/loader/html/utils/flat_table_utils.pyi +29 -0
  65. gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
  66. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  67. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  68. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  69. gllm_docproc/loader/json/__init__.pyi +3 -0
  70. gllm_docproc/loader/json/json_elements_loader.pyi +33 -0
  71. gllm_docproc/loader/loader_utils.pyi +42 -0
  72. gllm_docproc/loader/pdf/__init__.pyi +13 -0
  73. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +40 -0
  74. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +45 -0
  75. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +50 -0
  76. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  77. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  78. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  79. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  80. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +38 -0
  81. gllm_docproc/loader/pdf/pymupdf_loader.pyi +43 -0
  82. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +44 -0
  83. gllm_docproc/loader/pdf/pymupdf_utils.pyi +34 -0
  84. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  85. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  86. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  87. gllm_docproc/loader/txt/__init__.pyi +3 -0
  88. gllm_docproc/loader/txt/txt_loader.pyi +26 -0
  89. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  90. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +37 -0
  91. gllm_docproc/model/__init__.pyi +4 -0
  92. gllm_docproc/model/element.pyi +37 -0
  93. gllm_docproc/model/element_metadata.pyi +35 -0
  94. gllm_docproc/parser/__init__.pyi +4 -0
  95. gllm_docproc/parser/base_parser.pyi +29 -0
  96. gllm_docproc/parser/document/__init__.pyi +6 -0
  97. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  98. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  99. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  100. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  101. gllm_docproc/parser/html/__init__.pyi +4 -0
  102. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  103. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  104. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  105. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  106. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  107. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  108. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  109. gllm_docproc/parser/table/__init__.pyi +3 -0
  110. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  111. gllm_docproc/request_handler/__init__.pyi +3 -0
  112. gllm_docproc/request_handler/base_request_handler.pyi +17 -0
  113. gllm_docproc/response_handler/__init__.pyi +3 -0
  114. gllm_docproc/response_handler/base_response_handler.pyi +39 -0
  115. gllm_docproc/utils/__init__.pyi +0 -0
  116. gllm_docproc/utils/file_utils.pyi +76 -0
  117. gllm_docproc/utils/html_constants.pyi +121 -0
  118. gllm_docproc.build/.gitignore +1 -0
  119. gllm_docproc.cpython-312-darwin.so +0 -0
  120. gllm_docproc.pyi +149 -0
  121. gllm_docproc_binary-0.1.8.dist-info/METADATA +110 -0
  122. gllm_docproc_binary-0.1.8.dist-info/RECORD +123 -0
  123. gllm_docproc_binary-0.1.8.dist-info/WHEEL +4 -0
gllm_docproc/model/element_metadata.pyi
@@ -0,0 +1,35 @@
+ from pydantic import BaseModel
+
+ PDF: str
+ DOCX: str
+ XLSX: str
+ PPTX: str
+ CSV: str
+ TXT: str
+ HTML: str
+ AUDIO: str
+ IMAGE: str
+ VIDEO: str
+
+ class ElementMetadata(BaseModel):
+     """Element metadata model.
+
+     This class serves as the Element metadata model for storing element metadata.
+
+     Mandatory Attributes:
+         source (str): The source of the element.
+         source_type (str): The source type of the element.
+         loaded_datetime (datetime): The datetime when the element is loaded.
+     """
+     source: str
+     source_type: str
+     loaded_datetime: str
+     class Config:
+         """Pydantic model configuration.
+
+         This class defines the Pydantic model configuration for the ElementMetadata model.
+
+         Attributes:
+             extra (str): Allow extra fields.
+         """
+         extra: str
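
Because ElementMetadata is a pydantic model whose Config allows extra fields, callers can attach arbitrary metadata alongside the three mandatory attributes. A minimal usage sketch (the page_number field is an invented example, and note that the stub types loaded_datetime as str despite the docstring saying datetime):

```python
from gllm_docproc.model.element_metadata import ElementMetadata

metadata = ElementMetadata(
    source="report.pdf",
    source_type="pdf",
    loaded_datetime="2024-01-01T00:00:00",  # typed as str in the stub
    page_number=3,  # extra field, permitted because Config allows extras
)
print(metadata.source_type)  # "pdf"
```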
gllm_docproc/parser/__init__.pyi
@@ -0,0 +1,4 @@
+ from .base_parser import BaseParser as BaseParser
+ from .pipeline_parser import PipelineParser as PipelineParser
+
+ __all__ = ['BaseParser', 'PipelineParser']
gllm_docproc/parser/base_parser.pyi
@@ -0,0 +1,29 @@
+ import abc
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseParser(ABC, metaclass=abc.ABCMeta):
+     """Base class for document parsers.
+
+     This class serves as the base for document parsers, which define the structure for every
+     content element of a document.
+
+     Methods:
+         parse(loaded_elements, **kwargs): Abstract method to parse a document.
+     """
+     @abstractmethod
+     def parse(self, loaded_elements: Any, **kwargs: Any) -> Any:
+         """Parse loaded elements to get the element structure.
+
+         This method is abstract and must be implemented in subclasses.
+         It defines the process of parsing a document using loaded elements.
+
+         Args:
+             loaded_elements (Any): The loaded elements from the loader, ideally formatted as List[Dict].
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             Any: The parsed document, ideally formatted as List[Dict]. Each dictionary within
+                 the list is recommended to follow the structure of the 'Element' model,
+                 to ensure consistency and ease of use across the Document Processing Orchestrator.
+         """
gllm_docproc/parser/document/__init__.pyi
@@ -0,0 +1,6 @@
+ from .docx_parser import DOCXParser as DOCXParser
+ from .pdf_parser import PDFParser as PDFParser
+ from .txt_parser import TXTParser as TXTParser
+ from .xlsx_parser import XLSXParser as XLSXParser
+
+ __all__ = ['DOCXParser', 'PDFParser', 'XLSXParser', 'TXTParser']
gllm_docproc/parser/document/docx_parser.pyi
@@ -0,0 +1,27 @@
+ from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, HEADER as HEADER, HEADING as HEADING, MAX_HEADING_LEVEL as MAX_HEADING_LEVEL, PARAGRAPH as PARAGRAPH, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from typing import Any
+
+ class DOCXParser(BaseParser):
+     """A DOCX parser for parsing the text structure of DOCX documents.
+
+     This class serves as the DOCX parser for parsing the text structure of DOCX documents.
+     It defines the structure of each element parsed from the given loaded_elements.
+
+     Methods:
+         parse(loaded_elements, **kwargs): Parse the document from the loaded elements.
+     """
+     def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Parse the document from the loaded elements.
+
+         This method defines the text structure from loaded_elements (DOCX loader output) based on
+         each element's style_name. Elements with a customized style_name are categorized as
+         paragraphs (for example, 'Heading', 'Heading Body', and 'Title 1' all become paragraphs).
+
+         Args:
+             loaded_elements (list[dict[str, Any]]): A list of loaded elements containing text content and metadata.
+             **kwargs (Any): Additional keyword arguments for parsing the document.
+
+         Returns:
+             list[dict[str, Any]]: A list of parsed elements containing text content and metadata.
+         """
gllm_docproc/parser/document/pdf_parser.pyi
@@ -0,0 +1,35 @@
+ from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, FOOTNOTE as FOOTNOTE, HEADER as HEADER, HEADING as HEADING, PARAGRAPH as PARAGRAPH, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from typing import Any
+
+ HEADER_THRESHOLD_POSITION: int
+ FOOTER_THRESHOLD_POSITION: int
+ FOOTNOTE_POSITION_RATIO: float
+
+ class PDFParser(BaseParser):
+     """A class to parse PDF documents.
+
+     This class serves as a PDF parser for parsing or defining the structure of text within PDF documents
+     based on the text metadata (font size, font family, coordinates, etc.).
+
+     Methods:
+         parse: Parse the loaded elements.
+     """
+     def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Parse the loaded elements.
+
+         This method determines the text structure of PDF loaded elements based on
+         their metadata.
+
+         Args:
+             loaded_elements (list[dict[str, Any]]): A list of dictionaries containing loaded element
+                 content and metadata.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             header_footer_tolerance (int, optional): An integer value indicating the tolerance for header
+                 and footer detection. Defaults to 0.
+
+         Returns:
+             list[dict[str, Any]]: A list of dictionaries containing parsed element content and metadata.
+         """
gllm_docproc/parser/document/txt_parser.pyi
@@ -0,0 +1,22 @@
+ from gllm_docproc.model.element import Element as Element, PARAGRAPH as PARAGRAPH, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from typing import Any
+
+ class TXTParser(BaseParser):
+     """TXT parser for parsing text files.
+
+     Methods:
+         parse: Parse a list of elements from a text file.
+     """
+     def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Parse a list of elements from a text file.
+
+         All elements with the UNCATEGORIZED_TEXT structure will be converted to PARAGRAPH.
+
+         Args:
+             loaded_elements (list[dict[str, Any]]): The list of elements that have already been loaded.
+             **kwargs: Additional keyword arguments.
+
+         Returns:
+             list[dict[str, Any]]: A list of elements.
+         """
gllm_docproc/parser/document/xlsx_parser.pyi
@@ -0,0 +1,26 @@
+ from gllm_docproc.model.element import Element as Element
+ from gllm_docproc.parser import BaseParser as BaseParser
+ from typing import Any
+
+ DEFAULT_SHEET_NAME_PATTERN: str
+
+ class XLSXParser(BaseParser):
+     """An XLSX parser for parsing the text structure of XLSX documents.
+
+     This class serves as the XLSX parser for parsing the text structure of XLSX documents.
+     It defines the structure of each element parsed from the given loaded_elements.
+
+     Methods:
+         parse(loaded_elements, **kwargs): Parse the document from the loaded elements.
+     """
+     def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Parse loaded elements by assigning a structure to each element.
+
+         Args:
+             loaded_elements (list[dict[str, Any]]): A list of dictionaries representing loaded elements.
+             **kwargs (Any): Additional arguments for parsing the document.
+
+         Returns:
+             list[dict[str, Any]]: A list of dictionaries containing parsed elements with assigned structures.
+
+         """
gllm_docproc/parser/html/__init__.pyi
@@ -0,0 +1,4 @@
+ from .flat.html_flat_parser import HTMLFlatParser as HTMLFlatParser
+ from .nested.html_nested_parser import HTMLNestedParser as HTMLNestedParser
+
+ __all__ = ['HTMLFlatParser', 'HTMLNestedParser']
gllm_docproc/parser/html/flat/__init__.pyi
File without changes
gllm_docproc/parser/html/flat/html_flat_parser.pyi
@@ -0,0 +1,27 @@
+ from gllm_docproc.model.element import PARAGRAPH as PARAGRAPH
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from gllm_docproc.utils.html_constants import ItemDataKeys as ItemDataKeys, Structure as Structure
+ from typing import Any
+
+ class HTMLFlatParser(BaseParser):
+     """This class extends BaseParser and is specifically designed for parsing elements loaded from web content.
+
+     It assigns a structure to each loaded element based on the HTML tags present in its metadata.
+
+     Attributes:
+         None
+
+     Methods:
+         parse(loaded_elements: list[dict], **kwargs: dict[str, Any]) -> list[dict]:
+             Parses the loaded_elements and assigns a structure to each element based on its HTML tags.
+     """
+     def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: dict[str, Any]) -> list[dict[str, Any]]:
+         """Parses the loaded_elements and assigns a structure to each element based on its HTML tags.
+
+         Args:
+             loaded_elements (list[dict]): The elements loaded from web content to be parsed.
+             **kwargs (dict[str, Any]): Additional keyword arguments.
+
+         Returns:
+             list[dict]: The parsed elements with assigned structures.
+         """
gllm_docproc/parser/html/nested/__init__.pyi
File without changes
gllm_docproc/parser/html/nested/html_json_processor.pyi
@@ -0,0 +1,158 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.model.element import PARAGRAPH as PARAGRAPH, TABLE as TABLE, TITLE as TITLE
+ from gllm_docproc.parser.html.nested.nested_element import NestedElement as NestedElement
+ from gllm_docproc.utils.html_constants import ContentDataKeys as ContentDataKeys, ErrorMessage as ErrorMessage, FORMATTING_TAGS as FORMATTING_TAGS, HTMLTags as HTMLTags, ItemDataKeys as ItemDataKeys, MetaDataKeys as MetaDataKeys, SPACING as SPACING, Structure as Structure, TableConstants as TableConstants
+
+ class HTMLJsonProcessor:
+     """Processor for processing items scraped by the spider.
+
+     This pipeline processes the raw data scraped by the spider, formats it, and stores it in a JSON format.
+     It also handles errors during the processing and logging of the data.
+
+     Attributes:
+         logger: An instance of a logger, used for logging runtime information.
+         element_id: A counter for the elements processed by the pipeline.
+         processor_result: A dictionary that holds the processed data.
+     """
+     logger: Incomplete
+     element_id: int
+     processor_result: Incomplete
+     def __init__(self) -> None:
+         """Initialize the HTMLJsonProcessor."""
+     def process_item(self, item: list[dict]):
+         """Processes each item passed by the spider.
+
+         The method formats the raw data and stores it in the processor_result dictionary.
+
+         Args:
+             item (list): The raw data scraped by the spider.
+
+         Returns:
+             list: The processed item.
+         """
+     def add_title_element(self, item) -> None:
+         """Adds the title element to the processor_result dictionary.
+
+         Args:
+             item (dict): The raw data scraped by the spider.
+         """
+     def extract_data(self, current: dict, data: NestedElement):
+         """Extracts data from the raw data.
+
+         This method traverses the raw data and extracts the necessary information.
+
+         Args:
+             current (dict): The current node in the raw data.
+             data (NestedElement): The dictionary where the extracted data is stored.
+         """
+     def handle_table_data(self, current, data: NestedElement):
+         """Handles table content.
+
+         Args:
+             current (dict): The current node in the raw data. It should contain the table content and metadata.
+             data (dict): The dictionary where the extracted data is stored.
+         """
+     def handle_media_data(self, current, data: NestedElement):
+         """Handles media content.
+
+         Args:
+             current (dict): The current node in the raw data.
+             data (dict): The dictionary where the extracted data is stored.
+         """
+     def handle_string_content(self, current, data: NestedElement):
+         """Handles string content.
+
+         Args:
+             current (dict): The current node in the raw data.
+             data (dict): The dictionary where the extracted data is stored.
+         """
+     def handle_other_cases(self, current, data: NestedElement):
+         """Handles other cases.
+
+         Args:
+             current (dict): The current node in the raw data.
+             data (dict): The dictionary where the extracted data is stored.
+         """
+     def handle_current_tag(self, current, data: NestedElement) -> tuple[NestedElement, dict]:
+         """Handles the current tag. This method checks the current tag and updates the data accordingly.
+
+         Args:
+             current (dict): The current node in the raw data.
+             data (dict): The dictionary where the extracted data is stored.
+
+         Returns:
+             NestedElement: The updated NestedElement object.
+             dict: A dictionary containing additional arguments.
+         """
+     def handle_content(self, current, data: NestedElement, args: dict):
+         """Handles content. This method iterates over the content and extracts the necessary information.
+
+         Args:
+             current (dict): The current node in the raw data.
+             data (NestedElement): The dictionary where the extracted data is stored.
+             args (dict): The dictionary containing the arguments for the method.
+         """
+     def add_result(self, data: NestedElement):
+         """Adds the processed data to the processor_result dictionary.
+
+         Args:
+             data (dict): The processed data.
+         """
+     def add_link(self, data: NestedElement) -> NestedElement:
+         """Adds a link to the processed data content.
+
+         Args:
+             data (dict): The processed data.
+
+         Returns:
+             dict: The processed data.
+         """
+     def add_index(self, data: NestedElement) -> NestedElement:
+         """Adds an index to the processed data content.
+
+         Args:
+             data (dict): The processed data.
+
+         Returns:
+             dict: The processed data.
+         """
+     def handle_media(self, current, data: NestedElement) -> NestedElement:
+         """Handles media content.
+
+         Args:
+             current (dict): The current node in the raw data.
+             data (dict): The dictionary where the extracted data is stored.
+
+         Returns:
+             dict: The processed data.
+         """
+     def handle_table(self, current, data: NestedElement) -> list:
+         """Handle Table.
+
+         This method processes table content by iterating over its metadata, handling each row based on its type,
+         and appending the result to the table data.
+
+         Args:
+             current (dict): The current node in the raw data. It should contain the table content and metadata.
+             data (dict): The dictionary where the extracted data is stored. This method adds a 'structure' key with the
+                 value 'table', and appends the extracted table data to this dictionary.
+
+         Returns:
+             list: A list of dictionaries containing the extracted table data.
+         """
+     def print_row(self, row, col_size: Incomplete | None = None):
+         """Formats a table row.
+
+         Args:
+             row (list): The row to be formatted.
+             col_size (list | None, optional): The size of the columns. Defaults to None.
+
+         Returns:
+             str: The formatted row.
+         """
+     def print_table_separator(self, row):
+         """Formats a table separator.
+
+         Returns:
+             str: The formatted table separator.
+         """
gllm_docproc/parser/html/nested/html_nested_parser.pyi
@@ -0,0 +1,24 @@
+ from gllm_docproc.model.element import Element as Element
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from gllm_docproc.parser.html.nested.html_json_processor import HTMLJsonProcessor as HTMLJsonProcessor
+ from typing import Any
+
+ class HTMLNestedParser(BaseParser):
+     """A parser class for processing loaded HTML elements into parsed elements.
+
+     This class inherits from the BaseParser class and implements the parse method
+     to convert loaded HTML elements into a processed JSON format.
+
+     Attributes:
+         None
+     """
+     def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: dict[str, Any]) -> list[dict[str, Any]]:
+         """Processes loaded HTML elements into a JSON format.
+
+         Args:
+             loaded_elements (list[dict[str, Any]]): The loaded HTML elements to be processed.
+             **kwargs (dict[str, Any]): Additional keyword arguments.
+
+         Returns:
+             list[dict[str, Any]]: The processed JSON representation of the HTML elements.
+         """
gllm_docproc/parser/html/nested/nested_element.pyi
@@ -0,0 +1,31 @@
+ from gllm_docproc.model.element import Element as Element
+
+ class NestedElement(Element):
+     """A specialized class extending Element to represent nested elements.
+
+     This class includes additional functionality specific to nested elements, such as generating
+     a unique element_id and providing methods to convert the instance to a dictionary or Element.
+
+     Attributes:
+         element_id (int): A unique identifier for the nested element.
+
+     Methods:
+         to_dict(): Convert the NestedElement instance to a dictionary.
+         to_element(): Convert the NestedElement instance to an Element.
+     """
+     element_id: int
+     def to_dict(self):
+         """Convert the NestedElement instance to a dictionary.
+
+         Returns:
+             dict: A dictionary representation of the NestedElement instance.
+         """
+     def to_element(self) -> Element:
+         """Convert the NestedElement instance to an Element.
+
+         This method creates an Element instance from the current NestedElement. It deep copies the metadata,
+         assigns the element_id, and constructs an Element with the associated text, metadata, and structure.
+
+         Returns:
+             Element: The Element instance created from the NestedElement.
+         """
gllm_docproc/parser/pipeline_parser.pyi
@@ -0,0 +1,33 @@
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from typing import Any
+
+ class PipelineParser:
+     """Pipeline parser for parsing documents.
+
+     This class serves as the pipeline parser for parsing documents. It defines the structure for
+     parsing documents with several parsers in a pipeline.
+
+     Methods:
+         add_parser(parser): Add a parser to the pipeline parser.
+         parse(elements, **kwargs): Parse the elements using the parsers.
+     """
+     parsers: list[BaseParser]
+     def __init__(self) -> None:
+         """Initialize the pipeline parser."""
+     def add_parser(self, parser: BaseParser):
+         """Add a parser to the pipeline parser.
+
+         This method defines the process of adding a parser to the pipeline parser.
+
+         Args:
+             parser (BaseParser): The parser to be added.
+         """
+     def parse(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Parse the elements using the pipeline parser.
+
+         This method defines the process of parsing the elements using the parsers.
+
+         Args:
+             elements (list[dict[str, Any]]): A list of dictionaries containing elements.
+             **kwargs (Any): Additional keyword arguments.
+         """
gllm_docproc/parser/table/__init__.pyi
@@ -0,0 +1,3 @@
+ from .table_caption_parser import TableCaptionParser as TableCaptionParser
+
+ __all__ = ['TableCaptionParser']
gllm_docproc/parser/table/table_caption_parser.pyi
@@ -0,0 +1,66 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.model.element import Element as Element, HEADING as HEADING, PARAGRAPH as PARAGRAPH, TABLE as TABLE, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.parser.base_parser import BaseParser as BaseParser
+ from typing import Any
+
+ TABLE_AND_CAPTION_STRUCTURE: Incomplete
+ UPPER_ELEMENT_IS_CAPTION: str
+ LOWER_ELEMENT_IS_CAPTION: str
+ MAX_CAPTION_LENGTH: str
+ REMOVE_CAPTION_FROM_ELEMENT: str
+ MAX_CAPTION_ELEMENTS: str
+ UPPER_CAPTION_EXTRACTOR: str
+ LOWER_CAPTION_EXTRACTOR: str
+
+ def curry_upper_caption_extractor(remove_caption_from_element: bool):
+     """Curry Upper Caption Extractor.
+
+     This function curries the extract_upper_caption function with the remove_caption_from_element parameter.
+
+     Why do we need currying?
+     1. So the user can customize the upper_caption_extractor function.
+     2. The customized upper_caption_extractor may not require the remove_caption_from_element parameter.
+
+     Args:
+         remove_caption_from_element (bool): A boolean value to remove the caption from the element.
+
+     Returns:
+         function: The function to extract the upper caption.
+     """
+ def curry_lower_caption_extractor(remove_caption_from_element: bool):
+     """Curry Lower Caption Extractor.
+
+     This function curries the extract_lower_caption function with the remove_caption_from_element parameter.
+
+     Why do we need currying?
+     1. So the user can customize the lower_caption_extractor function.
+     2. The customized lower_caption_extractor may not require the remove_caption_from_element parameter.
+
+     Args:
+         remove_caption_from_element (bool): A boolean value to remove the caption from the element.
+
+     Returns:
+         function: The function to extract the lower caption.
+     """
+
+ class TableCaptionParser(BaseParser):
+     """TableCaptionParser class.
+
+     A class to extract table captions from a document and add them to the metadata of the table element.
+
+     Methods:
+         parse(loaded_elements, **kwargs): Extract table captions from a document and add them to
+             the metadata of the table element.
+     """
+     def parse(self, loaded_elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Parses the elements to extract table captions.
+
+         This method extracts table captions from the elements and adds them to the metadata of the table element.
+
+         Args:
+             loaded_elements (list[dict[str, Any]]): The elements to extract table captions from.
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             list[dict[str, Any]]: The elements with the table captions added to the metadata.
+         """
gllm_docproc/request_handler/__init__.pyi
@@ -0,0 +1,3 @@
+ from .base_request_handler import BaseRequestHandler as BaseRequestHandler
+
+ __all__ = ['BaseRequestHandler']
gllm_docproc/request_handler/base_request_handler.pyi
@@ -0,0 +1,17 @@
+ import abc
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseRequestHandler(ABC, metaclass=abc.ABCMeta):
+     """Base class for request handlers."""
+     @abstractmethod
+     def handle_request(self, **kwargs: Any) -> None:
+         """Handles a request.
+
+         Args:
+             **kwargs (Any): Arbitrary keyword arguments.
+                 The implementing class is responsible for defining the arguments.
+
+         Returns:
+             None
+         """
gllm_docproc/response_handler/__init__.pyi
@@ -0,0 +1,3 @@
+ from .base_response_handler import BaseResponseHandler as BaseResponseHandler
+
+ __all__ = ['BaseResponseHandler']
gllm_docproc/response_handler/base_response_handler.pyi
@@ -0,0 +1,39 @@
+ import abc
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseResponseHandler(ABC, metaclass=abc.ABCMeta):
+     """Base class for response handlers."""
+     @abstractmethod
+     def handle_success_response(self, **kwargs: Any) -> None:
+         """Handles a success response (successfully indexed).
+
+         Args:
+             **kwargs (Any): Arbitrary keyword arguments.
+                 The implementing class is responsible for defining the arguments.
+
+         Returns:
+             None
+         """
+     @abstractmethod
+     def handle_deleted_response(self, **kwargs: Any) -> None:
+         """Handles a deleted response (successfully deleted).
+
+         Args:
+             **kwargs (Any): Arbitrary keyword arguments.
+                 The implementing class is responsible for defining the arguments.
+
+         Returns:
+             None
+         """
+     @abstractmethod
+     def handle_failed_response(self, **kwargs: Any) -> None:
+         """Handles a failed response (either failed to index or failed to delete).
+
+         Args:
+             **kwargs (Any): Arbitrary keyword arguments.
+                 The implementing class is responsible for defining the arguments.
+
+         Returns:
+             None
+         """
gllm_docproc/utils/__init__.pyi
File without changes