gllm-docproc-binary 0.7.22__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gllm-docproc-binary might be problematic. Click here for more details.

Files changed (167) hide show
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +28 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +15 -0
  11. gllm_docproc/data_generator/__init__.pyi +5 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +18 -0
  13. gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
  14. gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
  15. gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
  16. gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
  17. gllm_docproc/downloader/__init__.pyi +5 -0
  18. gllm_docproc/downloader/base_downloader.pyi +19 -0
  19. gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
  20. gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
  21. gllm_docproc/downloader/html/__init__.pyi +6 -0
  22. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  23. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  24. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  25. gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
  26. gllm_docproc/downloader/html/html_downloader.pyi +114 -0
  27. gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
  28. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  29. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  30. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  31. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  32. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
  33. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  34. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
  35. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  36. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  37. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  38. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
  39. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  40. gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
  41. gllm_docproc/dpo_router/__init__.pyi +5 -0
  42. gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
  43. gllm_docproc/dpo_router/loader_router.pyi +52 -0
  44. gllm_docproc/dpo_router/parser_router.pyi +42 -0
  45. gllm_docproc/housekeeping/__init__.pyi +3 -0
  46. gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
  47. gllm_docproc/indexer/__init__.pyi +3 -0
  48. gllm_docproc/indexer/base_indexer.pyi +30 -0
  49. gllm_docproc/indexer/graph/__init__.pyi +4 -0
  50. gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
  51. gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
  52. gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
  53. gllm_docproc/indexer/vector/__init__.pyi +3 -0
  54. gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
  55. gllm_docproc/loader/__init__.pyi +4 -0
  56. gllm_docproc/loader/audio/__init__.pyi +3 -0
  57. gllm_docproc/loader/audio/audio_loader.pyi +45 -0
  58. gllm_docproc/loader/base_loader.pyi +30 -0
  59. gllm_docproc/loader/csv/__init__.pyi +3 -0
  60. gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
  61. gllm_docproc/loader/docx/__init__.pyi +5 -0
  62. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  63. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  64. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  65. gllm_docproc/loader/exception/__init__.pyi +4 -0
  66. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  67. gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
  68. gllm_docproc/loader/html/__init__.pyi +5 -0
  69. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  70. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  71. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  72. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +65 -0
  73. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  74. gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
  75. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  76. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  77. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  78. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  79. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  80. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  81. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  82. gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
  83. gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
  84. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  85. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  86. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  87. gllm_docproc/loader/image/__init__.pyi +3 -0
  88. gllm_docproc/loader/image/image_loader.pyi +54 -0
  89. gllm_docproc/loader/json/__init__.pyi +3 -0
  90. gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
  91. gllm_docproc/loader/loader_utils.pyi +43 -0
  92. gllm_docproc/loader/pdf/__init__.pyi +14 -0
  93. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
  94. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
  95. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
  96. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  97. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  98. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  99. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  100. gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
  101. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
  102. gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
  103. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
  104. gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
  105. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  106. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  107. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  108. gllm_docproc/loader/pptx/__init__.pyi +3 -0
  109. gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
  110. gllm_docproc/loader/txt/__init__.pyi +3 -0
  111. gllm_docproc/loader/txt/txt_loader.pyi +55 -0
  112. gllm_docproc/loader/video/__init__.pyi +3 -0
  113. gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
  114. gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
  115. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  116. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
  117. gllm_docproc/model/__init__.pyi +7 -0
  118. gllm_docproc/model/element.pyi +38 -0
  119. gllm_docproc/model/element_metadata.pyi +35 -0
  120. gllm_docproc/model/loader_type.pyi +20 -0
  121. gllm_docproc/model/media.pyi +51 -0
  122. gllm_docproc/model/parser_type.pyi +19 -0
  123. gllm_docproc/parser/__init__.pyi +4 -0
  124. gllm_docproc/parser/base_parser.pyi +28 -0
  125. gllm_docproc/parser/document/__init__.pyi +7 -0
  126. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  127. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  128. gllm_docproc/parser/document/pptx_parser.pyi +34 -0
  129. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  130. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  131. gllm_docproc/parser/html/__init__.pyi +4 -0
  132. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  133. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  134. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  135. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  136. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  137. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  138. gllm_docproc/parser/image/__init__.pyi +4 -0
  139. gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
  140. gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
  141. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  142. gllm_docproc/parser/table/__init__.pyi +3 -0
  143. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  144. gllm_docproc/request_handler/__init__.pyi +3 -0
  145. gllm_docproc/request_handler/base_request_handler.pyi +16 -0
  146. gllm_docproc/response_handler/__init__.pyi +3 -0
  147. gllm_docproc/response_handler/base_response_handler.pyi +38 -0
  148. gllm_docproc/utils/__init__.pyi +3 -0
  149. gllm_docproc/utils/async_utils.pyi +22 -0
  150. gllm_docproc/utils/file_utils.pyi +76 -0
  151. gllm_docproc/utils/html_constants.pyi +122 -0
  152. gllm_docproc/validator/__init__.pyi +6 -0
  153. gllm_docproc/validator/base_validator.pyi +34 -0
  154. gllm_docproc/validator/character_count_validator.pyi +26 -0
  155. gllm_docproc/validator/file_size_validator.pyi +20 -0
  156. gllm_docproc/validator/model/__init__.pyi +4 -0
  157. gllm_docproc/validator/model/validator_input.pyi +50 -0
  158. gllm_docproc/validator/model/validator_result.pyi +19 -0
  159. gllm_docproc/validator/page_count_validator.pyi +23 -0
  160. gllm_docproc/validator/pipeline_validator.pyi +40 -0
  161. gllm_docproc.build/.gitignore +1 -0
  162. gllm_docproc.cp311-win_amd64.pyd +0 -0
  163. gllm_docproc.pyi +220 -0
  164. gllm_docproc_binary-0.7.22.dist-info/METADATA +216 -0
  165. gllm_docproc_binary-0.7.22.dist-info/RECORD +167 -0
  166. gllm_docproc_binary-0.7.22.dist-info/WHEEL +5 -0
  167. gllm_docproc_binary-0.7.22.dist-info/top_level.txt +1 -0
@@ -0,0 +1,54 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata
4
+ from gllm_docproc.model.element import Element as Element, IMAGE as IMAGE
5
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
6
+ from gllm_docproc.model.media import Media as Media, MediaSourceType as MediaSourceType, MediaType as MediaType
7
+ from typing import Any
8
+
9
+ class ImageLoader(BaseLoader):
10
+ """A class for loading standalone image files.
11
+
12
+ This class defines the structure for loading standalone image files.
13
+ It supports all image formats by validating that the MIME type starts with 'image/'.
14
+
15
+ Methods:
16
+ load: Load an image file into a list of elements.
17
+ is_image_file: Check if a file is a valid image file based on the mime type.
18
+ """
19
+ logger: Incomplete
20
+ def __init__(self) -> None:
21
+ """Initialize the ImageLoader class."""
22
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
23
+ """Load an image file into a list of elements.
24
+
25
+ This method loads an image file and creates Element objects with media metadata containing the image data.
26
+ For multi-page images, each page/frame is extracted as a separate element.
27
+
28
+ Args:
29
+ source (str): The path to the image file.
30
+ loaded_elements (list[dict[str, Any]] | None, optional): A list of elements that have already been loaded.
31
+ **kwargs (Any): Additional keyword arguments for the loader.
32
+
33
+ Kwargs:
34
+ original_source (str, optional): The original source of the document.
35
+
36
+ Returns:
37
+ list[dict[str, Any]]: A list of elements containing the image data.
38
+
39
+ Raises:
40
+ FileNotFoundError: If the file does not exist.
41
+ ValueError: If the file is not a valid image file.
42
+ """
43
+ def is_image_file(self, source: str) -> bool:
44
+ """Check if a file is a valid image file.
45
+
46
+ This method uses the `magic` library to check if a file is a valid image file
47
+ by examining its MIME type. Any file with a MIME type starting with 'image/' is considered valid.
48
+
49
+ Args:
50
+ source (str): The path to the file.
51
+
52
+ Returns:
53
+ bool: True if the file is a valid image file, False otherwise.
54
+ """
@@ -0,0 +1,3 @@
1
+ from .json_elements_loader import JSONElementsLoader as JSONElementsLoader
2
+
3
+ __all__ = ['JSONElementsLoader']
@@ -0,0 +1,35 @@
1
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
2
+ from gllm_docproc.loader.loader_utils import validate_file_extension as validate_file_extension
3
+ from gllm_docproc.model.element import Element as Element
4
+ from typing import Any
5
+
6
+ JSON: str
7
+
8
+ class JSONElementsLoader(BaseLoader):
9
+ """JSON Elements Loader class.
10
+
11
+ This class provides a loader for extracting information from JSON files.
12
+ The JSON file must be in the format of a list of dictionaries, where each dictionary
13
+ must follow the structure of the Element class.
14
+
15
+ Methods:
16
+ load(source, loaded_elements, **kwargs): Load and process a document.
17
+ """
18
+ def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> list[dict[str, Any]]:
19
+ """Load and process a document.
20
+
21
+ This method loads the JSON file and returns the list of elements. If file id is provided,
22
+ the file id will be added to the element metadata and the chunk id and chunk relation metadata
23
+ that contains file id as prefix will be updated.
24
+
25
+ Args:
26
+ source (str): The file path of the JSON file.
27
+ loaded_elements (Any, optional): The loaded elements. The JSON loader ignores this parameter.
28
+ **kwargs (Any): The keyword arguments.
29
+
30
+ Kwargs:
31
+ file_id (str, optional): The file id for the elements. Defaults to None.
32
+
33
+ Returns:
34
+ list[dict[str, Any]]: The loaded elements.
35
+ """
@@ -0,0 +1,43 @@
1
+ from gllm_docproc.loader.exception import UnsupportedFileExtensionError as UnsupportedFileExtensionError
2
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
3
+
4
+ def validate_file_extension(expected_extensions: str | list[str], loader_name: str):
5
+ """Decorator to validate the file extension of the input file.
6
+
7
+ Args:
8
+ expected_extensions (str | list[str]): The expected file extension(s). Can be a single extension
9
+ as a string or a list of valid extensions. Extensions are case-insensitive.
10
+ loader_name (str): The name of the loader.
11
+
12
+ Returns:
13
+ Callable[[Callable[..., Any]], Callable[..., Any]]: A decorator that wraps the original
14
+ function to validate the file extension.
15
+
16
+ Raises:
17
+ UnsupportedFileExtensionError: If the file extension does not match the expected extension.
18
+ """
19
+ def create_base_element_metadata(source: str, source_type: str) -> ElementMetadata:
20
+ """Create the base element metadata.
21
+
22
+ This function creates the base element metadata for the loaded element. Base element metadata
23
+ includes the source, source type, and loaded datetime.
24
+
25
+ Args:
26
+ source (str): The source of the element.
27
+ source_type (str): The source type.
28
+
29
+ Returns:
30
+ ElementMetadata: The base element metadata.
31
+ """
32
+ def trim_table_empty_cells(table: list[list[str]]) -> list[list[str]]:
33
+ """Trim the empty cells in the table.
34
+
35
+ This function trims the empty cells in the table by removing the empty cells at the end of each
36
+ row. The function also ensures that all rows have the same number of columns.
37
+
38
+ Args:
39
+ table (List[List[str]]): A list of lists containing the table content.
40
+
41
+ Returns:
42
+ List[List[str]]: A list of lists containing the trimmed table content.
43
+ """
@@ -0,0 +1,14 @@
1
+ from .adobe_pdf_extract_loader import AdobePDFExtractLoader as AdobePDFExtractLoader
2
+ from .azure_ai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader as AzureAIDocumentIntelligenceLoader
3
+ from .azure_ai_document_intelligence_raw_loader import AzureAIDocumentIntelligenceRawLoader as AzureAIDocumentIntelligenceRawLoader
4
+ from .glair_vision_ocr_loader import GLAIRVisionOCRLoader as GLAIRVisionOCRLoader
5
+ from .pdf_miner_loader import PDFMinerLoader as PDFMinerLoader
6
+ from .pdf_miner_word_loader import PDFMinerWordLoader as PDFMinerWordLoader
7
+ from .pdf_page_loader import PDFPageLoader as PDFPageLoader
8
+ from .pdf_plumber_loader import PDFPlumberLoader as PDFPlumberLoader
9
+ from .pymupdf_loader import PyMuPDFLoader as PyMuPDFLoader
10
+ from .pymupdf_span_loader import PyMuPDFSpanLoader as PyMuPDFSpanLoader
11
+ from .tabula_loader import TabulaLoader as TabulaLoader
12
+ from .text_inject_pdf_plumber_loader import TextInjectPDFPlumberLoader as TextInjectPDFPlumberLoader
13
+
14
+ __all__ = ['AdobePDFExtractLoader', 'AzureAIDocumentIntelligenceLoader', 'AzureAIDocumentIntelligenceRawLoader', 'GLAIRVisionOCRLoader', 'PDFMinerLoader', 'PDFPageLoader', 'PDFMinerWordLoader', 'PDFPlumberLoader', 'PyMuPDFLoader', 'PyMuPDFSpanLoader', 'TabulaLoader', 'TextInjectPDFPlumberLoader']
@@ -0,0 +1,37 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import validate_file_extension as validate_file_extension
4
+ from typing import Any
5
+
6
+ class AdobePDFExtractLoader(BaseLoader):
7
+ """Adobe PDF Extract Loader class.
8
+
9
+ This class provides a loader for extracting information from PDF files using Adobe PDF Extract.
10
+ It implements the 'load' method to load PDF files and extract information.
11
+
12
+ Methods:
13
+ load(source, loaded_elements, **kwargs): Loads a PDF file and extracts information.
14
+ """
15
+ credentials: Incomplete
16
+ def __init__(self, client_id: str, client_secret: str) -> None:
17
+ """Initializes the Adobe PDF Extract Loader.
18
+
19
+ Args:
20
+ client_id (str): The client ID for the Adobe PDF Extract API.
21
+ client_secret (str): The client secret for the Adobe PDF Extract API.
22
+ """
23
+ def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> dict[str, Any]:
24
+ """Loads a PDF file and extracts information using Adobe PDF Extract.
25
+
26
+ This method loads a PDF file and extracts information from the file using Adobe PDF Extract.
27
+ The extracted information is returned as a dictionary. The extracted information includes text,
28
+ tables, and other elements from the PDF file.
29
+
30
+ Args:
31
+ source (str): The source PDF file to load and extract information from.
32
+ loaded_elements (Any): A list of loaded elements to be processed.
33
+ **kwargs (Any): Additional keyword arguments.
34
+
35
+ Returns:
36
+ dict[str, Any]: The extracted information as a dictionary
37
+ """
@@ -0,0 +1,47 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
4
+ from gllm_docproc.loader.pdf.azure_ai_document_intelligence_raw_loader import AzureAIDocumentIntelligenceRawLoader as AzureAIDocumentIntelligenceRawLoader
5
+ from gllm_docproc.loader.pdf.pdf_loader_utils import merge_loaded_elements_by_coordinates as merge_loaded_elements_by_coordinates
6
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
7
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, IMAGE as IMAGE, PDF as PDF
8
+ from gllm_docproc.model.media import Media as Media, MediaSourceType as MediaSourceType, MediaType as MediaType
9
+ from typing import Any
10
+
11
+ class AzureAIDocumentIntelligenceLoader(BaseLoader):
12
+ """Azure AI Document Intelligence Loader class.
13
+
14
+ This class provides a loader for extracting text, tables, and images from PDF files
15
+ using the Azure AI Document Intelligence API. It implements the 'load' method to handle document
16
+ loading from a given source.
17
+
18
+ Methods:
19
+ load(source, loaded_elements, **kwargs): Load and process a document.
20
+ """
21
+ INCH_TO_POINT: int
22
+ endpoint: Incomplete
23
+ key: Incomplete
24
+ logger: Incomplete
25
+ def __init__(self, endpoint: str, key: str) -> None:
26
+ """Initializes the Azure AI Document Intelligence Loader class.
27
+
28
+ Args:
29
+ endpoint (str): The endpoint for the Azure AI Document Intelligence API.
30
+ key (str): The key for the Azure AI Document Intelligence API.
31
+ """
32
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
33
+ """Load and process a document using the Azure AI Document Intelligence API.
34
+
35
+ Args:
36
+ source (str): The source of the document to be processed.
37
+ loaded_elements (list[dict[str, Any]], optional): A list of dictionaries containing loaded content and
38
+ metadata. Defaults to None.
39
+ **kwargs (Any): Additional keyword arguments for the loader.
40
+
41
+ Kwargs:
42
+ raw_output (dict[str, Any], optional): The raw output from the Azure AI Document Intelligence Raw Loader.
43
+ original_source (str, optional): The original source of the document.
44
+
45
+ Returns:
46
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
47
+ """
@@ -0,0 +1,49 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import validate_file_extension as validate_file_extension
4
+ from typing import Any
5
+
6
+ class AzureAIDocumentIntelligenceRawLoader(BaseLoader):
7
+ """Azure AI Document Intelligence Raw Loader class.
8
+
9
+ This class provides a loader for extracting text, tables, and images from PDF files
10
+ using the Azure AI Document Intelligence API. It implements the 'load' method to handle document
11
+ loading from a given source.
12
+
13
+ Methods:
14
+ load(source, loaded_elements, **kwargs): Load and process a document.
15
+ """
16
+ endpoint: Incomplete
17
+ key: Incomplete
18
+ def __init__(self, endpoint: str, key: str) -> None:
19
+ """Initializes the AzureAIDocumentIntelligenceRawLoader class.
20
+
21
+ Args:
22
+ endpoint (str): The endpoint for the Azure AI Document Intelligence API.
23
+ key (str): The key for the Azure AI Document Intelligence API.
24
+ """
25
+ def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> dict[str, Any]:
26
+ '''Load and process a document using the Azure AI Document Intelligence API.
27
+
28
+ This method sends a request to the Azure AI Document Intelligence API to extract information
29
+ from a PDF file. It returns the extracted information in a dictionary format, without any
30
+ additional processing.
31
+
32
+ Kwargs:
33
+ model_id (str, optional): The model used for document analysis. Azure AI Document Intelligence API
34
+ provides several prebuilt models for document analysis, check for available models at:
35
+ https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-model-overview?
36
+ view=doc-intel-4.0.0#model-analysis-features
37
+ Defaults to "prebuilt-layout".
38
+ features (list[str], optional): The add-on capabilities. Document Intelligence supports more sophisticated
39
+ analysis capabilities. These optional features can be enabled and disabled depending on the scenario
40
+ of the document extraction. Check for available add on capabilities at:
41
+ https://learn.microsoft.com/en-us/python/api/overview/azure/ai-documentintelligence-readme?
42
+ view=azure-python-preview#add-on-capabilities
43
+ Defaults to an empty list.
44
+
45
+ Args:
46
+ source (str): The source of the document to be processed.
47
+ loaded_elements (Any): A list of loaded elements to be processed.
48
+ **kwargs (Any): Additional keyword arguments.
49
+ '''
@@ -0,0 +1,38 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import validate_file_extension as validate_file_extension
4
+ from typing import Any
5
+
6
+ class GLAIRVisionOCRLoader(BaseLoader):
7
+ """GLAIR Vision OCR Loader class.
8
+
9
+ This class provides a loader for extracting text and tables from PDF files using the GLAIR Vision OCR API.
10
+ It implements the 'load' method to handle document loading from a given source.
11
+
12
+ Methods:
13
+ load(source, loaded_elements, **kwargs): Load and process a document.
14
+ """
15
+ username: Incomplete
16
+ password: Incomplete
17
+ api_key: Incomplete
18
+ def __init__(self, username: str, password: str, api_key: str) -> None:
19
+ """Initializes the GLAIRVisionOCRLoader class.
20
+
21
+ Args:
22
+ username (str): The username for the GLAIR Vision OCR API.
23
+ password (str): The password for the GLAIR Vision OCR API.
24
+ api_key (str): The API key for the GLAIR Vision OCR API.
25
+ """
26
+ def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> dict[str, Any]:
27
+ """Load and process a document using the GLAIR Vision OCR API.
28
+
29
+ This method loads a PDF document from a given source and extracts text and tables using the GLAIR Vision OCR API.
30
+
31
+ Args:
32
+ source (str): The source of the document to be processed.
33
+ loaded_elements (Any): The loaded elements from previous loaders.
34
+ **kwargs (Any): Additional keyword arguments for customization.
35
+
36
+ Returns:
37
+ dict: The OCR response from the GLAIR Vision OCR API.
38
+ """
@@ -0,0 +1,59 @@
1
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
2
+ from typing import Any
3
+
4
+ def merge_loaded_elements_by_coordinates(loaded_elements: list[Element], existing_loaded_elements: list[Element], **kwargs: Any) -> list[Element]:
5
+ """Merge the loaded elements by coordinates.
6
+
7
+ This function merges elements from 'loaded_elements' into 'existing_loaded_elements' based on
8
+ coordinates. Any of the 'loaded_elements' that lie inside the 'existing_loaded_elements' (e.g. a table) are
9
+ treated as duplicated information and are not included in the merged list.
10
+
11
+ Args:
12
+ loaded_elements (List[Element]): A list of Elements containing loaded element content.
13
+ existing_loaded_elements (List[Element]): A list of existing Elements.
14
+ kwargs (Any): Additional keyword arguments for merging the loaded elements.
15
+
16
+ Kwargs:
17
+ is_object_inside_box_threshold (float, optional): The threshold of the intersection area to the area
18
+ of the object. Defaults to 1.
19
+ merge_element_with_duplicates (Callable[[Element, List[Element]], Element], optional): The function
20
+ to merge the new element with the duplicate elements. Defaults to _merge_element_with_duplicates.
21
+
22
+ Returns:
23
+ list[Element]: A list of Element containing merged loaded element content.
24
+ """
25
+ def bbox_to_coordinates(bbox: list[float]) -> list[int]:
26
+ """Convert the bounding box to coordinates.
27
+
28
+ This method converts the bounding box to coordinates.
29
+
30
+ Args:
31
+ bbox (list[float]): The bounding box.
32
+
33
+ Returns:
34
+ list[int]: The coordinates.
35
+ """
36
+ def is_object_inside_box(object_coordinates: list[int], box_coordinates: list[int], threshold: float = 1) -> bool:
37
+ """Check whether the object's coordinates lie inside the box.
38
+
39
+ Args:
40
+ object_coordinates (list[int]): The coordinates position of the object.
41
+ box_coordinates (list[int]): The coordinates position of the box.
42
+ threshold (float): The threshold of the intersection area to the area of the object.
43
+
44
+ Returns:
45
+ bool: True if the object's coordinates are inside the box.
46
+ """
47
+ def calculate_object_intersection_over_box_area(object_coordinates: list[int], box_coordinates: list[int]) -> float:
48
+ """Calculate the ratio of the object/box intersection area to the area of the object.
49
+
50
+ This function computes the area of intersection between the given object coordinates and box coordinates,
51
+ and then calculates the ratio of this intersection area to the area of the object.
52
+
53
+ Args:
54
+ object_coordinates (list[int]): The coordinates of the object in the format [left, right, bottom, top].
55
+ box_coordinates (list[int]): The coordinates of the bounding box in the format [left, right, bottom, top].
56
+
57
+ Returns:
58
+ float: The ratio of the intersection area to the area of the object. Returns 0 if there is no intersection.
59
+ """
@@ -0,0 +1,38 @@
1
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
2
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
3
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
4
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
5
+ from typing import Any
6
+
7
+ class PDFMinerLoader(BaseLoader):
8
+ """A class for loading and processing PDF document using PDFMiner.
9
+
10
+ This class defines the structure for loading and processing PDF document to retrieve required values
11
+ (text and metadata). It implements the 'load' method to handle PDF loading from a given file path.
12
+
13
+ PDFMinerLoader is used to extract the TEXT and metadata from the PDF document.
14
+ The text loader has to be the first loader in the pipeline. This prioritization is because subsequent
15
+ loaders like the Table Loader may contain overlapping information with the Text Loader.
16
+ Therefore, these subsequent loaders rely on the output from the Text Loader. They merge the
17
+ loaded elements and filter out any duplicates by using the information provided by the Text Loader.
18
+
19
+ Methods:
20
+ load(source, loaded_elements, **kwargs): Load a PDF document.
21
+ """
22
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
23
+ """Load and process a PDF document specified by the file path and name (source).
24
+
25
+ This method defines the process of loading a PDF document using its file path.
26
+ It uses PDFMiner to extract element text and element metadata from the PDF document.
27
+
28
+ Args:
29
+ source (str): The path to the PDF document file.
30
+ loaded_elements (list[dict[str, Any]]): A list of dictionaries containing loaded content and metadata.
31
+ **kwargs (Any): Additional keyword arguments for the loader.
32
+
33
+ Kwargs:
34
+ original_source (str, optional): The original source of the document.
35
+
36
+ Returns:
37
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
38
+ """
@@ -0,0 +1,33 @@
1
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
2
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
3
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
4
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
5
+ from typing import Any
6
+
7
+ class PDFMinerWordLoader(BaseLoader):
8
+ """PDFMinerWordLoader is used to extract the TEXT from the PDF document.
9
+
10
+ This class defines the structure for loading PDF documents using PDFMiner per word.
11
+ It implements the 'load' method to extract PDF information from a given file path.
12
+
13
+ PDFMinerWordLoader is used to extract the TEXT from the PDF document.
14
+
15
+ Methods:
16
+ load(source, loaded_elements, **kwargs): Load a PDF document.
17
+ """
18
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
19
+ """Load a PDF document.
20
+
21
+ This method loads a PDF document from a given file path.
22
+
23
+ Args:
24
+ source (str): The file path of the PDF document.
25
+ loaded_elements (list[dict[str, Any]] | None, optional): The loaded elements. Defaults to None.
26
+ **kwargs (Any): Additional keyword arguments.
27
+
28
+ Kwargs:
29
+ original_source (str, optional): The original source of the document.
30
+
31
+ Returns:
32
+ list[dict[str, Any]]: The loaded elements.
33
+ """
@@ -0,0 +1,41 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
4
+ from gllm_docproc.loader.pdf.pymupdf_utils import convert_page_to_image_base64 as convert_page_to_image_base64, create_page_element as create_page_element
5
+ from gllm_docproc.model.element import Element as Element, PAGE as PAGE
6
+ from gllm_docproc.model.element_metadata import PDF as PDF
7
+ from typing import Any
8
+
9
+ class PDFPageLoader(BaseLoader):
10
+ """PDF Page Loader class.
11
+
12
+ This class defines the structure for loading PDF page-level information.
13
+ It implements the 'load' method to load the PDF file from a given file path.
14
+
15
+ Methods:
16
+ load(source, loaded_elements, **kwargs): Load a PDF document.
17
+ """
18
+ dpi: Incomplete
19
+ def __init__(self, dpi: int = 150) -> None:
20
+ """Initialize the PDF Page Loader.
21
+
22
+ Args:
23
+ dpi (int, optional): Rendering resolution for each PDF page in Dots Per Inch (DPI).
24
+ Higher values produce higher-quality images but increase memory usage.
25
+ Defaults to 150.
26
+ """
27
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
28
+ """Load and extract page level information from PDF document.
29
+
30
+ Args:
31
+ source (str): The file path to the PDF document.
32
+ loaded_elements (list[dict[str, Any]] | None, optional): Previously loaded elements from the same source.
33
+ If provided, new elements will be combined with existing ones. Defaults to None.
34
+ **kwargs (Any): Additional keyword arguments for PDF processing configuration.
35
+
36
+ Kwargs:
37
+ original_source (str, optional): The original source of the document.
38
+
39
+ Returns:
40
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
41
+ """
@@ -0,0 +1,35 @@
1
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
2
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
3
+ from gllm_docproc.loader.pdf.pdf_loader_utils import bbox_to_coordinates as bbox_to_coordinates, merge_loaded_elements_by_coordinates as merge_loaded_elements_by_coordinates
4
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
5
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
6
+ from typing import Any
7
+
8
+ class PDFPlumberLoader(BaseLoader):
9
+ """A class for loading and processing PDF document using PDFPlumberLoader.
10
+
11
+ This class defines the structure for loading and processing PDF document to retrieve required values
12
+ (table and metadata). It implements the 'load' method to handle PDF loading from a given file path.
13
+
14
+ PDFPlumberLoader is used to extract the TABLE and metadata from the PDF document.
15
+
16
+ Methods:
17
+ load(source, loaded_elements, **kwargs): Load a PDF document.
18
+ """
19
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
20
+ """Load and process PDF document specified by the file path and name (source).
21
+
22
+ This method defines the process of loading a PDF document from its file path and name
22
+ and extracting table information from it using the PDF Plumber library.
24
+
25
+ Args:
26
+ source (str): The path to the PDF document file.
27
+ loaded_elements (list[dict[str, Any]] | None, optional): A list of dictionaries containing loaded content and metadata. Defaults to None.
28
+ **kwargs (Any): Additional keyword arguments for the loader.
29
+
30
+ Kwargs:
31
+ original_source (str, optional): The original source of the document.
32
+
33
+ Returns:
34
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
35
+ """
@@ -0,0 +1,55 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
4
+ from gllm_docproc.loader.pdf.pymupdf_utils import bbox_to_coordinates as bbox_to_coordinates, convert_page_to_image_base64 as convert_page_to_image_base64, create_page_element as create_page_element, extract_image_element as extract_image_element, find_related_link as find_related_link
5
+ from gllm_docproc.model.element import Element as Element, IMAGE as IMAGE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
6
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
7
+ from typing import Any
8
+
9
+ class PyMuPDFLoader(BaseLoader):
10
+ """A class for loading and processing PDF document using PyMuPDF.
11
+
12
+ This class defines the structure for loading and processing PDF document to retrieve required values
13
+ (text and image in base64 format). It implements the 'load' method to handle PDF loading from a given file path.
14
+
15
+ PyMuPDFLoader is used to extract the TEXT and IMAGE in base64 format from the PDF document.
16
+ The text loader has to be the first loader in the pipeline. This prioritization is because subsequent
17
+ loaders like the Table Loader may contain overlapping information with the Text Loader.
18
+ Therefore, these subsequent loaders rely on the output from the Text Loader. They merge the
19
+ loaded elements and filter out any duplicates by using the information provided by the Text Loader.
20
+
21
+ Methods:
22
+ load(source, loaded_elements, **kwargs): Load a PDF document.
23
+ """
24
+ fallback_to_image: Incomplete
25
+ page_dpi: Incomplete
26
+ def __init__(self, fallback_to_image: bool = True, page_dpi: int = 150) -> None:
27
+ """Initialize the PyMuPDF Loader.
28
+
29
+ Args:
30
+ fallback_to_image (bool, optional): A boolean to determine if the loader should fall back to
31
+ rendering the entire page as a base64-encoded image when no text or embedded images are found.
32
+ Defaults to True.
33
+ page_dpi (int, optional): The DPI of the page image. Defaults to 150.
34
+ """
35
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
36
+ """Load and process a PDF document specified by the file path and name (source).
37
+
38
+ This method defines the process of loading a PDF document using its file path.
39
+ It uses PyMuPDF to extract element text and element image from the PDF document.
40
+
41
+ Args:
42
+ source (str): The path to the PDF document file.
43
+ loaded_elements (list[dict[str, Any]] | None, optional): A list of dictionaries containing loaded content and metadata. Defaults to None.
44
+ **kwargs (Any): Additional keyword arguments for the loader.
45
+
46
+ Kwargs:
47
+ original_source (str, optional): The original source of the document.
48
+ hyperlink_as_markdown (bool, optional): A boolean to determine if the hyperlink should be in
49
+ markdown format. Defaults to True.
50
+ sort_elements (Callable, optional): A callable function to sort the elements in every page.
51
+ Defaults to None, which means no sorting will be done.
52
+
53
+ Returns:
54
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
55
+ """
@@ -0,0 +1,56 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
4
+ from gllm_docproc.loader.pdf.pymupdf_utils import bbox_to_coordinates as bbox_to_coordinates, convert_page_to_image_base64 as convert_page_to_image_base64, create_page_element as create_page_element, extract_image_element as extract_image_element, find_related_link as find_related_link
5
+ from gllm_docproc.model.element import Element as Element, IMAGE as IMAGE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
6
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
7
+ from typing import Any
8
+
9
+ class PyMuPDFSpanLoader(BaseLoader):
10
+ """PyMuPDFSpanLoader class to extract text per span from a PDF file using PyMuPDF.
11
+
12
+ This class defines the structure for extracting text per span from a PDF file using PyMuPDF.
13
+ It implements the load method to extract information from a PDF file from a given source.
14
+
15
+ PyMuPDFSpanLoader is used to extract the TEXT, HYPERLINK, and IMAGE in base64 format from the PDF document.
16
+ The text loader has to be the first loader in the pipeline. This prioritization is because subsequent
17
+ loaders like the Table Loader may contain overlapping information with the Text Loader.
18
+ Therefore, these subsequent loaders rely on the output from the Text Loader. They merge the
19
+ loaded elements and filter out any duplicates by using the information provided by the Text Loader.
20
+
21
+ Methods:
22
+ load(source, loaded_elements, **kwargs): Load a PDF document.
23
+ """
24
+ fallback_to_image: Incomplete
25
+ page_dpi: Incomplete
26
+ def __init__(self, fallback_to_image: bool = True, page_dpi: int = 150) -> None:
27
+ """Initialize the PyMuPDF Loader.
28
+
29
+ Args:
30
+ fallback_to_image (bool, optional): A boolean to determine if the loader should fall back to
31
+ rendering the entire page as a base64-encoded image when no text or embedded images are found.
32
+ Defaults to True.
33
+ page_dpi (int, optional): The DPI of the page image. Defaults to 150.
34
+ """
35
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
36
+ """Load a PDF file using PyMuPDF and extract text span.
37
+
38
+ This method loads a PDF file using PyMuPDF and extracts text per span. It will return
39
+ a list of loaded elements. Span is a segment of text within a document, representing a
40
+ continuous sequence of characters with the same formatting (such as font, size, and color).
41
+
42
+ Args:
43
+ source (str): The path to the PDF file.
44
+ loaded_elements (list[dict[str, Any]] | None): A list of loaded elements. Defaults to None.
45
+ **kwargs (Any): Additional keyword arguments.
46
+
47
+ Kwargs:
48
+ original_source (str, optional): The original source of the document.
49
+ hyperlink_as_markdown (bool, optional): A boolean to determine if the hyperlink should be in
50
+ markdown format. Defaults to True.
51
+ sort_elements (Callable, optional): A callable function to sort the elements in every page.
52
+ Defaults to None, which means no sorting will be done.
53
+
54
+ Returns:
55
+ list[dict[str, Any]]: A list of loaded elements.
56
+ """