gllm-docproc-binary 0.1.8 (cp311-cp311-win_amd64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of gllm-docproc-binary might be problematic.

Files changed (123)
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +29 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +16 -0
  11. gllm_docproc/data_generator/__init__.pyi +3 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +19 -0
  13. gllm_docproc/downloader/__init__.pyi +3 -0
  14. gllm_docproc/downloader/base_downloader.pyi +16 -0
  15. gllm_docproc/downloader/html/__init__.pyi +4 -0
  16. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  17. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  18. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  19. gllm_docproc/downloader/html/html_downloader.pyi +91 -0
  20. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  21. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  22. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  23. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  24. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +29 -0
  25. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  26. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +50 -0
  27. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  28. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  29. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  30. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +45 -0
  31. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  32. gllm_docproc/downloader/html/utils/web_utils.pyi +21 -0
  33. gllm_docproc/dpo_router/__init__.pyi +3 -0
  34. gllm_docproc/dpo_router/base_dpo_router.pyi +17 -0
  35. gllm_docproc/housekeeping/__init__.pyi +3 -0
  36. gllm_docproc/housekeeping/base_housekeeping.pyi +15 -0
  37. gllm_docproc/indexer/__init__.pyi +3 -0
  38. gllm_docproc/indexer/base_indexer.pyi +31 -0
  39. gllm_docproc/indexer/graph/__init__.pyi +3 -0
  40. gllm_docproc/indexer/knowledge_graph/__init__.pyi +4 -0
  41. gllm_docproc/loader/__init__.pyi +4 -0
  42. gllm_docproc/loader/audio/__init__.pyi +3 -0
  43. gllm_docproc/loader/base_loader.pyi +31 -0
  44. gllm_docproc/loader/docx/__init__.pyi +5 -0
  45. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  46. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  47. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  48. gllm_docproc/loader/exception/__init__.pyi +3 -0
  49. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  50. gllm_docproc/loader/html/__init__.pyi +5 -0
  51. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  52. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  53. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  54. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +52 -0
  55. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  56. gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
  57. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  58. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  59. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  60. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  61. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  62. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  63. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  64. gllm_docproc/loader/html/utils/flat_table_utils.pyi +29 -0
  65. gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
  66. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  67. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  68. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  69. gllm_docproc/loader/json/__init__.pyi +3 -0
  70. gllm_docproc/loader/json/json_elements_loader.pyi +33 -0
  71. gllm_docproc/loader/loader_utils.pyi +42 -0
  72. gllm_docproc/loader/pdf/__init__.pyi +13 -0
  73. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +40 -0
  74. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +45 -0
  75. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +50 -0
  76. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  77. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  78. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  79. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  80. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +38 -0
  81. gllm_docproc/loader/pdf/pymupdf_loader.pyi +43 -0
  82. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +44 -0
  83. gllm_docproc/loader/pdf/pymupdf_utils.pyi +34 -0
  84. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  85. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  86. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  87. gllm_docproc/loader/txt/__init__.pyi +3 -0
  88. gllm_docproc/loader/txt/txt_loader.pyi +26 -0
  89. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  90. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +37 -0
  91. gllm_docproc/model/__init__.pyi +4 -0
  92. gllm_docproc/model/element.pyi +37 -0
  93. gllm_docproc/model/element_metadata.pyi +35 -0
  94. gllm_docproc/parser/__init__.pyi +4 -0
  95. gllm_docproc/parser/base_parser.pyi +29 -0
  96. gllm_docproc/parser/document/__init__.pyi +6 -0
  97. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  98. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  99. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  100. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  101. gllm_docproc/parser/html/__init__.pyi +4 -0
  102. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  103. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  104. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  105. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  106. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  107. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  108. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  109. gllm_docproc/parser/table/__init__.pyi +3 -0
  110. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  111. gllm_docproc/request_handler/__init__.pyi +3 -0
  112. gllm_docproc/request_handler/base_request_handler.pyi +17 -0
  113. gllm_docproc/response_handler/__init__.pyi +3 -0
  114. gllm_docproc/response_handler/base_response_handler.pyi +39 -0
  115. gllm_docproc/utils/__init__.pyi +0 -0
  116. gllm_docproc/utils/file_utils.pyi +76 -0
  117. gllm_docproc/utils/html_constants.pyi +121 -0
  118. gllm_docproc.build/.gitignore +1 -0
  119. gllm_docproc.cp311-win_amd64.pyd +0 -0
  120. gllm_docproc.pyi +149 -0
  121. gllm_docproc_binary-0.1.8.dist-info/METADATA +110 -0
  122. gllm_docproc_binary-0.1.8.dist-info/RECORD +123 -0
  123. gllm_docproc_binary-0.1.8.dist-info/WHEEL +4 -0
gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi
@@ -0,0 +1,50 @@
+ from _typeshed import Incomplete
+ from azure.ai.documentintelligence.models import AnalyzeResult as AnalyzeResult
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import validate_file_extension as validate_file_extension
+ from typing import Any
+
+ class AzureAIDocumentIntelligenceRawLoader(BaseLoader):
+     """Azure AI Document Intelligence Raw Loader class.
+
+     This class provides a loader for extracting text, tables, and images from PDF files
+     using the Azure AI Document Intelligence API. It implements the 'load' method to handle
+     document loading from a given source.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load and process a document.
+     """
+     endpoint: Incomplete
+     key: Incomplete
+     def __init__(self, endpoint: str, key: str) -> None:
+         """Initializes the AzureAIDocumentIntelligenceRawLoader class.
+
+         Args:
+             endpoint (str): The endpoint for the Azure AI Document Intelligence API.
+             key (str): The key for the Azure AI Document Intelligence API.
+         """
+     def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> dict[str, Any]:
+         '''Load and process a document using the Azure AI Document Intelligence API.
+
+         This method sends a request to the Azure AI Document Intelligence API to extract information
+         from a PDF file. It returns the extracted information in a dictionary format, without any
+         additional processing.
+
+         Kwargs:
+             model_id (str, optional): The model used for document analysis. The Azure AI Document
+                 Intelligence API provides several prebuilt models for document analysis; see the
+                 available models at:
+                 https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-model-overview?view=doc-intel-4.0.0#model-analysis-features
+                 Defaults to "prebuilt-layout".
+             features (list[str], optional): The add-on capabilities. Document Intelligence supports
+                 more sophisticated analysis capabilities, which can be enabled or disabled depending
+                 on the document extraction scenario. See the available add-on capabilities at:
+                 https://learn.microsoft.com/en-us/python/api/overview/azure/ai-documentintelligence-readme?view=azure-python-preview#add-on-capabilities
+                 Defaults to an empty list.
+
+         Args:
+             source (str): The source of the document to be processed.
+             loaded_elements (Any): A list of loaded elements to be processed.
+             **kwargs (Any): Additional keyword arguments.
+         '''
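Based only on the signatures in this stub, usage would look something like the sketch below; the endpoint, key, and file path are placeholders, and the kwargs shown mirror the documented defaults.

from gllm_docproc.loader.pdf.azure_ai_document_intelligence_raw_loader import AzureAIDocumentIntelligenceRawLoader

# Placeholder credentials; substitute your own Azure resource values.
loader = AzureAIDocumentIntelligenceRawLoader(
    endpoint="https://<your-resource>.cognitiveservices.azure.com/",
    key="<your-api-key>",
)

# model_id and features are the documented kwargs, shown here with their defaults.
result = loader.load("document.pdf", model_id="prebuilt-layout", features=[])
print(type(result))  # dict[str, Any], per the stub's return annotation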
gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi
@@ -0,0 +1,38 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import validate_file_extension as validate_file_extension
+ from typing import Any
+
+ class GLAIRVisionOCRLoader(BaseLoader):
+     """GLAIR Vision OCR Loader class.
+
+     This class provides a loader for extracting text and tables from PDF files using the
+     GLAIR Vision OCR API. It implements the 'load' method to handle document loading from
+     a given source.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load and process a document.
+     """
+     username: Incomplete
+     password: Incomplete
+     api_key: Incomplete
+     def __init__(self, username: str, password: str, api_key: str) -> None:
+         """Initializes the GLAIRVisionOCRLoader class.
+
+         Args:
+             username (str): The username for the GLAIR Vision OCR API.
+             password (str): The password for the GLAIR Vision OCR API.
+             api_key (str): The API key for the GLAIR Vision OCR API.
+         """
+     def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> dict[str, Any]:
+         """Load and process a document using the GLAIR Vision OCR API.
+
+         This method loads a PDF document from a given source and extracts text and tables
+         using the GLAIR Vision OCR API.
+
+         Args:
+             source (str): The source of the document to be processed.
+             loaded_elements (Any): The loaded elements from previous loaders.
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             dict: The OCR response from the GLAIR Vision OCR API.
+         """
gllm_docproc/loader/pdf/pdf_loader_utils.pyi
@@ -0,0 +1,59 @@
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from typing import Any
+
+ def merge_loaded_elements_by_coordinates(loaded_elements: list[Element], existing_loaded_elements: list[Element], **kwargs: Any) -> list[Element]:
+     """Merge the loaded elements by coordinates.
+
+     This function merges elements from 'loaded_elements' into 'existing_loaded_elements' based on
+     coordinates. Any 'loaded_elements' that fall inside an 'existing_loaded_elements' entry
+     (e.g. a table) are treated as duplicate information and are not included in the merged list.
+
+     Args:
+         loaded_elements (list[Element]): A list of Elements containing loaded element content.
+         existing_loaded_elements (list[Element]): A list of existing Elements.
+         **kwargs (Any): Additional keyword arguments for merging the loaded elements.
+
+     Kwargs:
+         is_object_inside_box_threshold (float, optional): The threshold of the intersection area
+             relative to the area of the object. Defaults to 1.
+         merge_element_with_duplicates (Callable[[Element, list[Element]], Element], optional): The function
+             to merge the new element with the duplicate elements. Defaults to _merge_element_with_duplicates.
+
+     Returns:
+         list[Element]: A list of Elements containing merged loaded element content.
+     """
+ def bbox_to_coordinates(bbox: list[float]) -> list[int]:
+     """Convert the bounding box to coordinates.
+
+     Args:
+         bbox (list[float]): The bounding box.
+
+     Returns:
+         list[int]: The coordinates.
+     """
+ def is_object_inside_box(object_coordinates: list[int], box_coordinates: list[int], threshold: float = 1) -> bool:
+     """Validate whether the object coordinates lie inside the box.
+
+     Args:
+         object_coordinates (list[int]): The coordinate position of the object.
+         box_coordinates (list[int]): The coordinate position of the box.
+         threshold (float): The threshold of the intersection area relative to the area of the object.
+
+     Returns:
+         bool: True if the object coordinates lie inside the box.
+     """
+ def calculate_object_intersection_over_box_area(object_coordinates: list[int], box_coordinates: list[int]) -> float:
+     """Calculate the ratio of the intersection area of an object to the area of a bounding box.
+
+     This function computes the area of intersection between the given object coordinates and box
+     coordinates, and then calculates the ratio of this intersection area to the area of the object.
+
+     Args:
+         object_coordinates (list[int]): The coordinates of the object in the format [left, right, bottom, top].
+         box_coordinates (list[int]): The coordinates of the bounding box in the format [left, right, bottom, top].
+
+     Returns:
+         float: The ratio of the intersection area to the area of the object. Returns 0 if there is no intersection.
+     """
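The docstrings above describe a standard axis-aligned rectangle computation. The sketch below illustrates the intersection-over-object-area idea for the documented [left, right, bottom, top] coordinate layout; it is an independent illustration, not the package's implementation.

def intersection_over_object_area(obj: list[float], box: list[float]) -> float:
    # obj and box are [left, right, bottom, top] rectangles.
    left = max(obj[0], box[0])
    right = min(obj[1], box[1])
    bottom = max(obj[2], box[2])
    top = min(obj[3], box[3])
    if right <= left or top <= bottom:
        return 0.0  # no intersection
    intersection = (right - left) * (top - bottom)
    object_area = (obj[1] - obj[0]) * (obj[3] - obj[2])
    return intersection / object_area if object_area else 0.0

# With threshold=1 (the documented default), an element only counts as "inside"
# a box when it is fully contained:
assert intersection_over_object_area([1, 2, 1, 2], [0, 10, 0, 10]) == 1.0
assert intersection_over_object_area([1, 2, 1, 2], [5, 10, 5, 10]) == 0.0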
gllm_docproc/loader/pdf/pdf_miner_loader.pyi
@@ -0,0 +1,38 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
+ from typing import Any
+
+ class PDFMinerLoader(BaseLoader):
+     """A class for loading and processing PDF documents using PDFMiner.
+
+     This class defines the structure for loading and processing a PDF document to retrieve the
+     required values (text and metadata). It implements the 'load' method to handle PDF loading
+     from a given file path.
+
+     PDFMinerLoader is used to extract the TEXT and metadata from the PDF document.
+     The text loader has to be the first loader in the pipeline. This prioritization is because
+     subsequent loaders like the Table Loader may contain overlapping information with the Text
+     Loader. Therefore, these subsequent loaders rely on the output from the Text Loader: they
+     merge the loaded elements and filter out any duplicates by using the information provided
+     by the Text Loader.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load a PDF document.
+     """
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load and process a PDF document specified by the file path and name (source).
+
+         This method defines the process of loading a PDF document using its file path.
+         It uses PDFMiner to extract element text and element metadata from the PDF document.
+
+         Args:
+             source (str): The path to the PDF document file.
+             loaded_elements (list[dict[str, Any]]): A list of dictionaries containing loaded content and metadata.
+             **kwargs (Any): Additional keyword arguments for the loader.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+
+         Returns:
+             list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
+         """
gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi
@@ -0,0 +1,33 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
+ from typing import Any
+
+ class PDFMinerWordLoader(BaseLoader):
+     """PDFMinerWordLoader is used to extract the TEXT from the PDF document.
+
+     This class defines the structure for loading PDF documents word by word using PDFMiner.
+     It implements the 'load' method to extract PDF information from a given file path.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load a PDF document.
+     """
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load a PDF document.
+
+         This method loads a PDF document from a given file path.
+
+         Args:
+             source (str): The file path of the PDF document.
+             loaded_elements (list[dict[str, Any]]): The loaded elements.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+
+         Returns:
+             list[dict[str, Any]]: The loaded elements.
+         """
gllm_docproc/loader/pdf/pdf_plumber_loader.pyi
@@ -0,0 +1,38 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
+ from gllm_docproc.loader.pdf.pdf_loader_utils import bbox_to_coordinates as bbox_to_coordinates, merge_loaded_elements_by_coordinates as merge_loaded_elements_by_coordinates
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
+ from pdfplumber._typing import T_obj as T_obj
+ from pdfplumber.page import Page as Page
+ from pdfplumber.table import Table as Table
+ from typing import Any
+
+ class PDFPlumberLoader(BaseLoader):
+     """A class for loading and processing PDF documents using pdfplumber.
+
+     This class defines the structure for loading and processing a PDF document to retrieve the
+     required values (tables and metadata). It implements the 'load' method to handle PDF loading
+     from a given file path.
+
+     PDFPlumberLoader is used to extract the TABLE and metadata from the PDF document.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load a PDF document.
+     """
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load and process a PDF document specified by the file path and name (source).
+
+         This method defines the process of loading a PDF document and extracting its table
+         information with the pdfplumber library, given the document's file path and name.
+
+         Args:
+             source (str): The path to the PDF document file.
+             loaded_elements (list[dict[str, Any]]): A list of dictionaries containing loaded content and metadata.
+             **kwargs (Any): Additional keyword arguments for the loader.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+
+         Returns:
+             list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
+         """
gllm_docproc/loader/pdf/pymupdf_loader.pyi
@@ -0,0 +1,43 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
+ from gllm_docproc.loader.pdf.pymupdf_utils import bbox_to_coordinates as bbox_to_coordinates, extract_image_element as extract_image_element, find_related_link as find_related_link
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
+ from typing import Any
+
+ class PyMuPDFLoader(BaseLoader):
+     """A class for loading and processing PDF documents using PyMuPDF.
+
+     This class defines the structure for loading and processing a PDF document to retrieve the
+     required values (text and images in base64 format). It implements the 'load' method to handle
+     PDF loading from a given file path.
+
+     PyMuPDFLoader is used to extract the TEXT and IMAGE in base64 format from the PDF document.
+     The text loader has to be the first loader in the pipeline. This prioritization is because
+     subsequent loaders like the Table Loader may contain overlapping information with the Text
+     Loader. Therefore, these subsequent loaders rely on the output from the Text Loader: they
+     merge the loaded elements and filter out any duplicates by using the information provided
+     by the Text Loader.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load a PDF document.
+     """
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load and process a PDF document specified by the file path and name (source).
+
+         This method defines the process of loading a PDF document using its file path.
+         It uses PyMuPDF to extract element text and element images from the PDF document.
+
+         Args:
+             source (str): The path to the PDF document file.
+             loaded_elements (list[dict[str, Any]]): A list of dictionaries containing loaded content and metadata.
+             **kwargs (Any): Additional keyword arguments for the loader.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+             hyperlink_as_markdown (bool, optional): A boolean to determine if hyperlinks should be
+                 rendered in markdown format. Defaults to True.
+             sort_elements (Callable, optional): A callable function to sort the elements on every page.
+                 Defaults to None, meaning no sorting will be done.
+
+         Returns:
+             list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
+         """
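The stub does not show the exact signature expected of the sort_elements callable, so the following reading-order sort is only a plausible sketch. It assumes the callable receives and returns one page's list of element dicts, and that each element's metadata carries a [left, right, bottom, top] "coordinates" entry; both assumptions are illustrative, not confirmed by the stub.

def sort_by_reading_order(elements: list[dict]) -> list[dict]:
    # Top-to-bottom, then left-to-right; assumes a hypothetical
    # metadata["coordinates"] == [left, right, bottom, top] layout.
    def key(element: dict) -> tuple:
        left, _right, _bottom, top = element["metadata"]["coordinates"]
        return (-top, left)  # larger top coordinate = higher on the page
    return sorted(elements, key=key)

# loader = PyMuPDFLoader()
# elements = loader.load("document.pdf", sort_elements=sort_by_reading_order)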
gllm_docproc/loader/pdf/pymupdf_span_loader.pyi
@@ -0,0 +1,44 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
+ from gllm_docproc.loader.pdf.pymupdf_utils import bbox_to_coordinates as bbox_to_coordinates, extract_image_element as extract_image_element, find_related_link as find_related_link
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
+ from typing import Any
+
+ class PyMuPDFSpanLoader(BaseLoader):
+     """PyMuPDFSpanLoader class to extract text per span from a PDF file using PyMuPDF.
+
+     This class defines the structure for extracting text per span from a PDF file using PyMuPDF.
+     It implements the 'load' method to extract information from a PDF file from a given source.
+
+     PyMuPDFSpanLoader is used to extract the TEXT, HYPERLINK, and IMAGE in base64 format from
+     the PDF document. The text loader has to be the first loader in the pipeline. This
+     prioritization is because subsequent loaders like the Table Loader may contain overlapping
+     information with the Text Loader. Therefore, these subsequent loaders rely on the output
+     from the Text Loader: they merge the loaded elements and filter out any duplicates by using
+     the information provided by the Text Loader.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load a PDF document.
+     """
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load a PDF file using PyMuPDF and extract text spans.
+
+         This method loads a PDF file using PyMuPDF and extracts text per span, returning a list
+         of loaded elements. A span is a segment of text within a document, representing a
+         continuous sequence of characters with the same formatting (such as font, size, and color).
+
+         Args:
+             source (str): The path to the PDF file.
+             loaded_elements (list[dict[str, Any]] | None): A list of loaded elements. Defaults to None.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+             hyperlink_as_markdown (bool, optional): A boolean to determine if hyperlinks should be
+                 rendered in markdown format. Defaults to True.
+             sort_elements (Callable, optional): A callable function to sort the elements on every page.
+                 Defaults to None, meaning no sorting will be done.
+
+         Returns:
+             list[dict[str, Any]]: A list of loaded elements.
+         """
gllm_docproc/loader/pdf/pymupdf_utils.pyi
@@ -0,0 +1,34 @@
+ from gllm_docproc.loader.pdf.pdf_loader_utils import bbox_to_coordinates as bbox_to_coordinates
+ from gllm_docproc.model.element import Element as Element, IMAGE as IMAGE
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+ from typing import Any
+
+ def extract_image_element(image_instance: dict[str, Any], page_idx: int, element_metadata: ElementMetadata, page_layout_width: int, page_layout_height: int) -> Element:
+     """Extract values (the image in base64 format and other metadata) from an image element.
+
+     This method defines the process of extracting the image value in base64 format from an image element.
+
+     Args:
+         image_instance (dict): The image instance.
+         page_idx (int): The number of the page index.
+         element_metadata (ElementMetadata): The element metadata.
+         page_layout_width (int): The width of the page layout.
+         page_layout_height (int): The height of the page layout.
+
+     Returns:
+         Element: An Element object containing the image in base64 format and metadata.
+     """
+ def find_related_link(text_rect: list[float], links: list[dict[str, Any]]) -> dict[str, Any] | None:
+     """Find the related link for a text rectangle.
+
+     This method finds the related link for a text rectangle. It returns the link if the text
+     rectangle intersects with the link rectangle.
+
+     Args:
+         text_rect (list[float]): The text rectangle.
+         links (list[dict[str, Any]]): A list of links.
+
+     Returns:
+         dict[str, Any] | None: The related link if the text rectangle intersects with the link
+             rectangle, or None if it does not.
+     """
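find_related_link is described as a rectangle-intersection test between a text rectangle and each link rectangle. A minimal version of that test might look like the sketch below; the "rect" key and the [x0, y0, x1, y1] layout are assumptions for illustration, not details confirmed by the stub.

def rects_intersect(a: list[float], b: list[float]) -> bool:
    # Axis-aligned overlap test for [x0, y0, x1, y1] rectangles (assumed layout).
    return a[0] < b[2] and b[0] < a[2] and a[1] < b[3] and b[1] < a[3]

def first_intersecting_link(text_rect: list[float], links: list[dict]) -> dict | None:
    # Return the first link whose (assumed) "rect" intersects the text rectangle.
    return next((link for link in links if rects_intersect(text_rect, link["rect"])), None)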
gllm_docproc/loader/pdf/tabula_loader.pyi
@@ -0,0 +1,32 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
+ from gllm_docproc.loader.pdf.pdf_loader_utils import merge_loaded_elements_by_coordinates as merge_loaded_elements_by_coordinates
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
+ from typing import Any
+
+ class TabulaLoader(BaseLoader):
+     """A class for loading a PDF and extracting its tables using Tabula.
+
+     This class defines the structure for loading a PDF and extracting tables from it using
+     Tabula. It implements the 'load' method to handle the loading and extraction process.
+
+     TabulaLoader is used to extract the TABLE and metadata from the PDF document.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load the PDF file and extract tables from it using Tabula.
+     """
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load the PDF file and extract tables from it using Tabula.
+
+         Args:
+             source (str): The file path of the PDF document.
+             loaded_elements (list[dict[str, Any]]): A list of loaded elements from the PDF document.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+
+         Returns:
+             list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
+         """
gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi
@@ -0,0 +1,37 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
+ from gllm_docproc.loader.pdf.pdf_loader_utils import merge_loaded_elements_by_coordinates as merge_loaded_elements_by_coordinates
+ from gllm_docproc.loader.pdf.pdf_miner_word_loader import PDFMinerWordLoader as PDFMinerWordLoader
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
+ from typing import Any
+
+ class TextInjectPDFPlumberLoader(BaseLoader):
+     """A class for loading PDF documents using pdfplumber, injecting text into tables.
+
+     This class defines the structure for loading PDF documents using pdfplumber while injecting
+     text into tables. It implements the 'load' method to handle PDF loading from a given file path.
+
+     TextInjectPDFPlumberLoader is used to extract the TABLE from the PDF document.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load a PDF document.
+     """
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load a PDF document.
+
+         This method loads a PDF document from a given file path.
+
+         Args:
+             source (str): The file path of the PDF document.
+             loaded_elements (list[dict[str, Any]]): The loaded elements.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+             font_size_threshold (int, optional): The font size threshold. Defaults to None.
+                 When None, the threshold is the most frequent font size multiplied by 2.
+
+         Returns:
+             list[dict[str, Any]]: The loaded elements.
+         """
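The documented default for font_size_threshold (the most frequent font size multiplied by 2) is easy to picture with a small sketch; the helper below is purely illustrative and not part of the package.

from collections import Counter

def default_font_size_threshold(font_sizes: list[float]) -> float:
    # Most frequent font size, multiplied by 2, per the documented default.
    most_frequent, _count = Counter(font_sizes).most_common(1)[0]
    return most_frequent * 2

print(default_font_size_threshold([10, 10, 10, 12, 24]))  # 20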
gllm_docproc/loader/pipeline_loader.pyi
@@ -0,0 +1,48 @@
+ from _typeshed import Incomplete
+ from gllm_datastore.cache_data_store.cache_data_store import BaseCacheDataStore as BaseCacheDataStore
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from typing import Any
+
+ class PipelineLoader:
+     """A pipeline loader for loading documents.
+
+     This class serves as the pipeline loader for loading documents. It defines the structure
+     for loading a document with several loaders arranged in a pipeline.
+
+     Methods:
+         add_loader(loader): Add a loader to the pipeline loader.
+         load(source, **kwargs): Load the document from the given source.
+     """
+     loaders: list[BaseLoader]
+     cache_data_store: Incomplete
+     logger: Incomplete
+     def __init__(self, cache_data_store: BaseCacheDataStore | None = None) -> None:
+         """Initialize the PipelineLoader.
+
+         Args:
+             cache_data_store (BaseCacheDataStore, optional): The cache data store to be used.
+                 Defaults to None.
+         """
+     def add_loader(self, loader: BaseLoader) -> None:
+         """Add a loader to the pipeline loader.
+
+         This method defines the process of adding a loader to the pipeline loader.
+
+         Args:
+             loader (BaseLoader): The loader to be added.
+         """
+     def load(self, source: str, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load the document from the given source.
+
+         This method defines the process of loading the document using the registered loaders.
+
+         Args:
+             source (str): May be a file path, a URL, or the content itself.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             ttl (int, optional): The TTL of the cache. Defaults to None.
+
+         Returns:
+             list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
+         """
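Putting PipelineLoader together with the ordering rule the PDF loader docstrings repeat (a text loader first, table loaders afterwards so duplicates can be merged out by coordinates), a plausible pipeline is sketched below, using only classes and signatures shown in this diff.

from gllm_docproc.loader.pipeline_loader import PipelineLoader
from gllm_docproc.loader.pdf.pdf_miner_loader import PDFMinerLoader
from gllm_docproc.loader.pdf.pdf_plumber_loader import PDFPlumberLoader

pipeline = PipelineLoader()
pipeline.add_loader(PDFMinerLoader())    # text loader first, per the docstrings
pipeline.add_loader(PDFPlumberLoader())  # table loader second; duplicates merged by coordinates

elements = pipeline.load("document.pdf")  # a list of element dicts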
gllm_docproc/loader/txt/__init__.pyi
@@ -0,0 +1,3 @@
+ from .txt_loader import TXTLoader as TXTLoader
+
+ __all__ = ['TXTLoader']
gllm_docproc/loader/txt/txt_loader.pyi
@@ -0,0 +1,26 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.model.element_metadata import TXT as TXT
+ from typing import Any
+
+ class TXTLoader(BaseLoader):
+     """A class for loading text files (.txt) into a list of elements.
+
+     Methods:
+         load: Load a text file into a list of elements.
+     """
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load a text file into a list of elements.
+
+         Args:
+             source (str): The path to the text file.
+             loaded_elements (list[dict[str, Any]]): The list of elements that have already been loaded.
+             **kwargs: Additional keyword arguments.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+
+         Returns:
+             list[dict[str, Any]]: A list of elements.
+         """
gllm_docproc/loader/xlsx/__init__.pyi
@@ -0,0 +1,3 @@
+ from .openpyxl_loader import OpenpyxlLoader as OpenpyxlLoader
+
+ __all__ = ['OpenpyxlLoader']
gllm_docproc/loader/xlsx/openpyxl_loader.pyi
@@ -0,0 +1,37 @@
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, XLSX as XLSX
+ from openpyxl.cell.cell import MergedCell as MergedCell
+ from typing import Any
+
+ class OpenpyxlLoader(BaseLoader):
+     """A class used to load and process XLSX documents using the openpyxl library.
+
+     This class inherits from the BaseLoader class and overrides its methods to provide
+     functionality for loading XLSX documents. It provides methods to extract tables from
+     the document, determine whether a row is a header based on its style attributes, and
+     split a table into headers and body based on the row styles and header threshold.
+
+     Methods:
+         load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any)
+             -> list[dict[str, Any]]: Load an XLSX document.
+     """
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load an XLSX document.
+
+         This method loads an XLSX document and extracts the table elements from each sheet.
+         It takes the source file path as input and returns a list of Element objects
+         representing the tables in the document.
+
+         Args:
+             source (str): The path to the XLSX document to load.
+             loaded_elements (list[dict[str, Any]]): The loaded elements from previous loaders.
+             **kwargs (Any): Additional keyword arguments to pass to the loader.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+
+         Returns:
+             list[dict[str, Any]]: The loaded elements from the XLSX document.
+         """
gllm_docproc/model/__init__.pyi
@@ -0,0 +1,4 @@
+ from .element import Element as Element
+ from .element_metadata import ElementMetadata as ElementMetadata
+
+ __all__ = ['Element', 'ElementMetadata']
gllm_docproc/model/element.pyi
@@ -0,0 +1,37 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+ from pydantic import BaseModel
+ from typing import Any
+
+ UNCATEGORIZED_TEXT: str
+ HEADER: str
+ TITLE: str
+ HEADING: Incomplete
+ PARAGRAPH: str
+ TABLE: str
+ AUDIO: str
+ IMAGE: str
+ VIDEO: str
+ FOOTER: str
+ FOOTNOTE: str
+ MAX_HEADING_LEVEL: int
+
+ class Element(BaseModel):
+     """An Element model.
+
+     This class serves as the Element model for storing element text, structure, and metadata.
+
+     Attributes:
+         text (str): The element text.
+         structure (str): The element structure.
+         metadata (ElementMetadata): The element metadata.
+     """
+     text: str
+     structure: str
+     metadata: ElementMetadata
+     @staticmethod
+     def to_list_dict(elements: list['Element']) -> list[dict[str, Any]]:
+         """Convert a list of Element objects to a list of dictionaries."""
+     @staticmethod
+     def from_list_dict(elements: list[dict[str, Any]]) -> list['Element']:
+         """Convert a list of dictionaries to a list of Element objects."""
+ """Convert a list of dictionaries to a list of Element objects."""