gllm-docproc-binary 0.7.26__cp311-cp311-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gllm-docproc-binary might be problematic. Click here for more details.

Files changed (168) hide show
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +28 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +15 -0
  11. gllm_docproc/data_generator/__init__.pyi +5 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +18 -0
  13. gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
  14. gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
  15. gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
  16. gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
  17. gllm_docproc/downloader/__init__.pyi +5 -0
  18. gllm_docproc/downloader/base_downloader.pyi +19 -0
  19. gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
  20. gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
  21. gllm_docproc/downloader/html/__init__.pyi +7 -0
  22. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  23. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  24. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  25. gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
  26. gllm_docproc/downloader/html/html_downloader.pyi +114 -0
  27. gllm_docproc/downloader/html/playwright_downloader.pyi +60 -0
  28. gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
  29. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  30. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  31. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  32. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  33. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
  34. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  35. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
  36. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  37. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  38. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  39. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
  40. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  41. gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
  42. gllm_docproc/dpo_router/__init__.pyi +5 -0
  43. gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
  44. gllm_docproc/dpo_router/loader_router.pyi +52 -0
  45. gllm_docproc/dpo_router/parser_router.pyi +42 -0
  46. gllm_docproc/housekeeping/__init__.pyi +3 -0
  47. gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
  48. gllm_docproc/indexer/__init__.pyi +3 -0
  49. gllm_docproc/indexer/base_indexer.pyi +30 -0
  50. gllm_docproc/indexer/graph/__init__.pyi +4 -0
  51. gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
  52. gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
  53. gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
  54. gllm_docproc/indexer/vector/__init__.pyi +3 -0
  55. gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
  56. gllm_docproc/loader/__init__.pyi +4 -0
  57. gllm_docproc/loader/audio/__init__.pyi +3 -0
  58. gllm_docproc/loader/audio/audio_loader.pyi +45 -0
  59. gllm_docproc/loader/base_loader.pyi +30 -0
  60. gllm_docproc/loader/csv/__init__.pyi +3 -0
  61. gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
  62. gllm_docproc/loader/docx/__init__.pyi +5 -0
  63. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  64. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  65. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  66. gllm_docproc/loader/exception/__init__.pyi +4 -0
  67. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  68. gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
  69. gllm_docproc/loader/html/__init__.pyi +5 -0
  70. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  71. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  72. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  73. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +66 -0
  74. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  75. gllm_docproc/loader/html/flat/html_flat_merger.pyi +23 -0
  76. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  77. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  78. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  79. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  80. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  81. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  82. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  83. gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
  84. gllm_docproc/loader/html/utils/html_utils.pyi +59 -0
  85. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  86. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  87. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  88. gllm_docproc/loader/image/__init__.pyi +3 -0
  89. gllm_docproc/loader/image/image_loader.pyi +54 -0
  90. gllm_docproc/loader/json/__init__.pyi +3 -0
  91. gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
  92. gllm_docproc/loader/loader_utils.pyi +43 -0
  93. gllm_docproc/loader/pdf/__init__.pyi +14 -0
  94. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
  95. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
  96. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
  97. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  98. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  99. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  100. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  101. gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
  102. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
  103. gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
  104. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
  105. gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
  106. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  107. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  108. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  109. gllm_docproc/loader/pptx/__init__.pyi +3 -0
  110. gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
  111. gllm_docproc/loader/txt/__init__.pyi +3 -0
  112. gllm_docproc/loader/txt/txt_loader.pyi +55 -0
  113. gllm_docproc/loader/video/__init__.pyi +3 -0
  114. gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
  115. gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
  116. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  117. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
  118. gllm_docproc/model/__init__.pyi +7 -0
  119. gllm_docproc/model/element.pyi +38 -0
  120. gllm_docproc/model/element_metadata.pyi +35 -0
  121. gllm_docproc/model/loader_type.pyi +20 -0
  122. gllm_docproc/model/media.pyi +51 -0
  123. gllm_docproc/model/parser_type.pyi +19 -0
  124. gllm_docproc/parser/__init__.pyi +4 -0
  125. gllm_docproc/parser/base_parser.pyi +28 -0
  126. gllm_docproc/parser/document/__init__.pyi +7 -0
  127. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  128. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  129. gllm_docproc/parser/document/pptx_parser.pyi +34 -0
  130. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  131. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  132. gllm_docproc/parser/html/__init__.pyi +4 -0
  133. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  134. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  135. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  136. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  137. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  138. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  139. gllm_docproc/parser/image/__init__.pyi +4 -0
  140. gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
  141. gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
  142. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  143. gllm_docproc/parser/table/__init__.pyi +3 -0
  144. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  145. gllm_docproc/request_handler/__init__.pyi +3 -0
  146. gllm_docproc/request_handler/base_request_handler.pyi +16 -0
  147. gllm_docproc/response_handler/__init__.pyi +3 -0
  148. gllm_docproc/response_handler/base_response_handler.pyi +38 -0
  149. gllm_docproc/utils/__init__.pyi +3 -0
  150. gllm_docproc/utils/async_utils.pyi +22 -0
  151. gllm_docproc/utils/file_utils.pyi +76 -0
  152. gllm_docproc/utils/html_constants.pyi +122 -0
  153. gllm_docproc/validator/__init__.pyi +6 -0
  154. gllm_docproc/validator/base_validator.pyi +34 -0
  155. gllm_docproc/validator/character_count_validator.pyi +26 -0
  156. gllm_docproc/validator/file_size_validator.pyi +20 -0
  157. gllm_docproc/validator/model/__init__.pyi +4 -0
  158. gllm_docproc/validator/model/validator_input.pyi +50 -0
  159. gllm_docproc/validator/model/validator_result.pyi +19 -0
  160. gllm_docproc/validator/page_count_validator.pyi +23 -0
  161. gllm_docproc/validator/pipeline_validator.pyi +40 -0
  162. gllm_docproc.build/.gitignore +1 -0
  163. gllm_docproc.cpython-311-darwin.so +0 -0
  164. gllm_docproc.pyi +222 -0
  165. gllm_docproc_binary-0.7.26.dist-info/METADATA +216 -0
  166. gllm_docproc_binary-0.7.26.dist-info/RECORD +168 -0
  167. gllm_docproc_binary-0.7.26.dist-info/WHEEL +5 -0
  168. gllm_docproc_binary-0.7.26.dist-info/top_level.txt +1 -0
@@ -0,0 +1,55 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
4
+ from gllm_docproc.loader.pdf.pymupdf_utils import bbox_to_coordinates as bbox_to_coordinates, convert_page_to_image_base64 as convert_page_to_image_base64, create_page_element as create_page_element, extract_image_element as extract_image_element, find_related_link as find_related_link
5
+ from gllm_docproc.model.element import Element as Element, IMAGE as IMAGE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
6
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
7
+ from typing import Any
8
+
9
+ class PyMuPDFLoader(BaseLoader):
10
+ """A class for loading and processing PDF document using PyMuPDF.
11
+
12
+ This class defines the structure for loading and processing a PDF document to retrieve the required values
13
+ (text and image in base64 format). It implements the 'load' method to handle PDF loading from a given file path.
14
+
15
+ PyMuPDFLoader is used to extract the TEXT and IMAGE in base64 format from the PDF document.
16
+ The text loader has to be the first loader in the pipeline. This ordering is required because subsequent
17
+ loaders like the Table Loader may contain overlapping information with the Text Loader.
18
+ Therefore, these subsequent loaders rely on the output from the Text Loader. They merge the
19
+ loaded elements and filter out any duplicates by using the information provided by the Text Loader.
20
+
21
+ Methods:
22
+ load(source, loaded_elements, **kwargs): Load a PDF document.
23
+ """
24
+ fallback_to_image: Incomplete
25
+ page_dpi: Incomplete
26
+ def __init__(self, fallback_to_image: bool = True, page_dpi: int = 150) -> None:
27
+ """Initialize the PyMuPDF Loader.
28
+
29
+ Args:
30
+ fallback_to_image (bool, optional): A boolean to determine if the loader should fall back to
31
+ rendering the entire page as a base64-encoded image when no text or embedded images are found.
32
+ Defaults to True.
33
+ page_dpi (int, optional): The DPI of the page image. Defaults to 150.
34
+ """
35
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
36
+ """Load and process a PDF document specified by the file path and name (source).
37
+
38
+ This method defines the process of loading a PDF document using its file path.
39
+ It uses PyMuPDF to extract element text and element image from the PDF document.
40
+
41
+ Args:
42
+ source (str): The path to the PDF document file.
43
+ loaded_elements (list[dict[str, Any]] | None): A list of dictionaries containing loaded content and metadata. Defaults to None.
44
+ **kwargs (Any): Additional keyword arguments for the loader.
45
+
46
+ Kwargs:
47
+ original_source (str, optional): The original source of the document.
48
+ hyperlink_as_markdown (bool, optional): A boolean to determine if the hyperlink should be in
49
+ markdown format. Defaults to True.
50
+ sort_elements (Callable, optional): A callable function to sort the elements in every page.
51
+ Defaults to None. Means no sorting will be done.
52
+
53
+ Returns:
54
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
55
+ """
@@ -0,0 +1,56 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
4
+ from gllm_docproc.loader.pdf.pymupdf_utils import bbox_to_coordinates as bbox_to_coordinates, convert_page_to_image_base64 as convert_page_to_image_base64, create_page_element as create_page_element, extract_image_element as extract_image_element, find_related_link as find_related_link
5
+ from gllm_docproc.model.element import Element as Element, IMAGE as IMAGE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
6
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
7
+ from typing import Any
8
+
9
+ class PyMuPDFSpanLoader(BaseLoader):
10
+ """PyMuPDFSpanLoader class to extract text per span from a PDF file using PyMuPDF.
11
+
12
+ This class defines the structure for extracting text per span from a PDF file using PyMuPDF.
13
+ It implements the load method to extract information from a PDF file from a given source.
14
+
15
+ PyMuPDFSpanLoader is used to extract the TEXT, HYPERLINK, and IMAGE in base64 format from the PDF document.
16
+ The text loader has to be the first loader in the pipeline. This ordering is required because subsequent
17
+ loaders like the Table Loader may contain overlapping information with the Text Loader.
18
+ Therefore, these subsequent loaders rely on the output from the Text Loader. They merge the
19
+ loaded elements and filter out any duplicates by using the information provided by the Text Loader.
20
+
21
+ Methods:
22
+ load(source, loaded_elements, **kwargs): Load a PDF document.
23
+ """
24
+ fallback_to_image: Incomplete
25
+ page_dpi: Incomplete
26
+ def __init__(self, fallback_to_image: bool = True, page_dpi: int = 150) -> None:
27
+ """Initialize the PyMuPDFSpanLoader.
28
+
29
+ Args:
30
+ fallback_to_image (bool, optional): A boolean to determine if the loader should fall back to
31
+ rendering the entire page as a base64-encoded image when no text or embedded images are found.
32
+ Defaults to True.
33
+ page_dpi (int, optional): The DPI of the page image. Defaults to 150.
34
+ """
35
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
36
+ """Load a PDF file using PyMuPDF and extract text span.
37
+
38
+ This method loads a PDF file using PyMuPDF and extracts text per span. It will return
39
+ a list of loaded elements. A span is a segment of text within a document, representing a
40
+ continuous sequence of characters with the same formatting (such as font, size, and color).
41
+
42
+ Args:
43
+ source (str): The path to the PDF file.
44
+ loaded_elements (list[dict[str, Any]] | None): A list of loaded elements. Defaults to None.
45
+ **kwargs (Any): Additional keyword arguments.
46
+
47
+ Kwargs:
48
+ original_source (str, optional): The original source of the document.
49
+ hyperlink_as_markdown (bool, optional): A boolean to determine if the hyperlink should be in
50
+ markdown format. Defaults to True.
51
+ sort_elements (Callable, optional): A callable function to sort the elements in every page.
52
+ Defaults to None. Means no sorting will be done.
53
+
54
+ Returns:
55
+ list[dict[str, Any]]: A list of loaded elements.
56
+ """
@@ -0,0 +1,77 @@
1
+ import fitz
2
+ from gllm_docproc.loader.pdf.pdf_loader_utils import bbox_to_coordinates as bbox_to_coordinates
3
+ from gllm_docproc.model.element import Element as Element, IMAGE as IMAGE
4
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
5
+ from gllm_docproc.model.media import Media as Media, MediaSourceType as MediaSourceType, MediaType as MediaType
6
+ from typing import Any
7
+
8
+ def fix_image_with_transform(image_bytes: bytes, transform: tuple) -> bytes:
9
+ """Apply the PDF affine transform (a,b,c,d,e,f) to an image using only the image's pixel grid.
10
+
11
+ Ignores page size and DPI. Picks an output pixel size that preserves detail by default.
12
+ The output density is not configurable through this signature (it takes no px_per_point parameter).
13
+
14
+ Args:
15
+ image_bytes: The image data as bytes.
16
+ transform: The PDF affine transform tuple (a,b,c,d,e,f).
17
+
18
+ Returns:
19
+ PNG bytes of the transformed image.
20
+ """
21
+ def extract_image_element(image_instance: dict[str, Any], page_idx: int, element_metadata: ElementMetadata, page_layout_width: int, page_layout_height: int) -> Element | None:
22
+ """Extract value (image in base64 format and other metadata) from image element.
23
+
24
+ This method defines the process of extracting Image value in base64 format from image element.
25
+
26
+ Args:
27
+ image_instance (dict): The image instance.
28
+ page_idx (int): The index of the page.
29
+ element_metadata (ElementMetadata): The element metadata.
30
+ page_layout_width (int): The width of the page layout.
31
+ page_layout_height (int): The height of the page layout.
32
+
33
+ Returns:
34
+ Element | None: An Element object containing image in base64 format and metadata.
35
+ None if the image is empty.
36
+ """
37
+ def find_related_link(text_rect: list[float], links: list[dict[str, Any]]) -> dict[str, Any] | None:
38
+ """Find the related link for a text rectangle.
39
+
40
+ This method finds the related link for a text rectangle. It will return the link if the text
41
+ rectangle intersects with the link rectangle.
42
+
43
+ Args:
44
+ text_rect (list[float]): The text rectangle.
45
+ links (list[dict[str, Any]]): A list of links.
46
+
47
+ Returns:
48
+ dict[str, Any] | None: The related link if the text rectangle intersects with the link rectangle
49
+ or None if the text rectangle does not intersect with the link rectangle.
50
+ """
51
+ def convert_page_to_image_base64(page: fitz.Page, dpi: int = 150) -> str:
52
+ """Convert a PDF page to a base64 encoded PNG image.
53
+
54
+ Args:
55
+ page (fitz.Page): The PDF page to convert.
56
+ dpi (int, optional): Rendering resolution for each PDF page in Dots Per Inch (DPI).
57
+ Higher values produce higher-quality images but increase memory usage. Defaults to 150.
58
+
59
+ Returns:
60
+ str: Base64 encoded PNG image.
61
+ """
62
+ def create_page_element(page_image_base64: str, page_number: int, page: fitz.Page, base_element_metadata: ElementMetadata, structure: str = ...) -> Element:
63
+ """Create a page element with the specified structure.
64
+
65
+ This function creates an Element representing a page with its image content.
66
+ It can be used to create either IMAGE or PAGE structure elements.
67
+
68
+ Args:
69
+ page_image_base64 (str): The page image in base64 format.
70
+ page_number (int): The page number.
71
+ page (fitz.Page): The PDF page.
72
+ base_element_metadata (ElementMetadata): The base element metadata.
73
+ structure (str, optional): The element structure type. Defaults to IMAGE.
74
+
75
+ Returns:
76
+ Element: The page element with the specified structure.
77
+ """
@@ -0,0 +1,32 @@
1
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
2
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
3
+ from gllm_docproc.loader.pdf.pdf_loader_utils import merge_loaded_elements_by_coordinates as merge_loaded_elements_by_coordinates
4
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
5
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
6
+ from typing import Any
7
+
8
+ class TabulaLoader(BaseLoader):
9
+ """A class for loading PDF and extracting Table from PDF using Tabula.
10
+
11
+ This class defines the structure for loading PDF and extracting Table from PDF using Tabula.
12
+ It implements the 'load' method to handle the loading and extraction process.
13
+
14
+ TabulaLoader is used to extract the TABLE and metadata from the PDF document.
15
+
16
+ Methods:
17
+ load(source, loaded_elements, **kwargs): Load the PDF file and extract Table from PDF using Tabula.
18
+ """
19
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
20
+ """Load the PDF file and extract Table from PDF using Tabula.
21
+
22
+ Args:
23
+ source (str): The file path of the PDF document.
24
+ loaded_elements (list[dict[str, Any]] | None): A list of loaded elements from the PDF document. Defaults to None.
25
+ **kwargs (Any): Additional keyword arguments.
26
+
27
+ Kwargs:
28
+ original_source (str, optional): The original source of the document.
29
+
30
+ Returns:
31
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
32
+ """
@@ -0,0 +1,37 @@
1
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
2
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
3
+ from gllm_docproc.loader.pdf.pdf_loader_utils import merge_loaded_elements_by_coordinates as merge_loaded_elements_by_coordinates
4
+ from gllm_docproc.loader.pdf.pdf_miner_word_loader import PDFMinerWordLoader as PDFMinerWordLoader
5
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
6
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PDF as PDF
7
+ from typing import Any
8
+
9
+ class TextInjectPDFPlumberLoader(BaseLoader):
10
+ """A class for loading PDF documents using PDFPlumber by injecting text into tables.
11
+
12
+ This class defines the structure for loading PDF documents using PDFPlumber by injecting text into tables.
13
+ It implements the 'load' method to handle PDF loading from a given file path.
14
+
15
+ TextInjectPDFPlumberLoader is used to extract the TABLE from the PDF document.
16
+
17
+ Methods:
18
+ load(source, loaded_elements, **kwargs): Load a PDF document.
19
+ """
20
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
21
+ """Load a PDF document.
22
+
23
+ This method loads a PDF document from a given file path.
24
+
25
+ Args:
26
+ source (str): The file path of the PDF document.
27
+ loaded_elements (list[dict[str, Any]] | None): The loaded elements. Defaults to None.
28
+ **kwargs (Any): Additional keyword arguments.
29
+
30
+ Kwargs:
31
+ original_source (str, optional): The original source of the document.
32
+ font_size_threshold (int, optional): The font size threshold. Defaults to None.
33
+ When None, the font size threshold will be the most frequent font size multiplied by 2.
34
+
35
+ Returns:
36
+ list[dict[str, Any]]: The loaded elements.
37
+ """
@@ -0,0 +1,48 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_datastore.cache.hybrid_cache.hybrid_cache import BaseHybridCache
3
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
4
+ from typing import Any
5
+
6
+ class PipelineLoader:
7
+ """A pipeline loader for loading documents.
8
+
9
+ This class serves as the pipeline loader for loading documents. It defines the structure for
10
+ loading a document with several loaders in a pipeline.
11
+
12
+ Methods:
13
+ add_loader(loader): Add loader to the pipeline loader.
14
+ load(source, **kwargs): Load the document from the given source.
15
+ """
16
+ loaders: list[BaseLoader]
17
+ cache_data_store: Incomplete
18
+ logger: Incomplete
19
+ def __init__(self, cache_data_store: BaseHybridCache | None = None) -> None:
20
+ """Initialize the PipelineLoader.
21
+
22
+ Args:
23
+ cache_data_store (BaseHybridCache, optional): The cache data store to be used.
24
+ Defaults to None.
25
+ """
26
+ def add_loader(self, loader: BaseLoader):
27
+ """Add loader to the pipeline loader.
28
+
29
+ This method defines the process of adding loader to the pipeline loader.
30
+
31
+ Args:
32
+ loader (BaseLoader): The loader to be added.
33
+ """
34
+ def load(self, source: str, **kwargs: Any) -> list[dict[str, Any]]:
35
+ """Load the document from the given source.
36
+
37
+ This method defines the process of loading the document using loaders.
38
+
39
+ Args:
40
+ source (str): May be a file path, a URL, or the content itself.
41
+ **kwargs (Any): Additional keyword arguments.
42
+
43
+ Kwargs:
44
+ ttl (int, optional): The TTL of the cache. Defaults to None.
45
+
46
+ Returns:
47
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
48
+ """
@@ -0,0 +1,3 @@
1
+ from .python_pptx_loader import PythonPPTXLoader as PythonPPTXLoader
2
+
3
+ __all__ = ['PythonPPTXLoader']
@@ -0,0 +1,48 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
4
+ from gllm_docproc.model.element import Element as Element, IMAGE as IMAGE, PARAGRAPH as PARAGRAPH, TABLE as TABLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
5
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, PPTX as PPTX
6
+ from gllm_docproc.model.media import Media as Media, MediaSourceType as MediaSourceType, MediaType as MediaType
7
+ from typing import Any
8
+
9
+ UNTITLED_CHART: str
10
+
11
+ class PythonPPTXLoader(BaseLoader):
12
+ """A class for loading and processing PPTX documents using PythonPPTXLoader.
13
+
14
+ This class defines the structure for loading and processing a PPTX
15
+ document to retrieve the content from its slides.
16
+ It implements the 'load' method to handle PPTX loading from a given file path.
17
+
18
+ PythonPPTXLoader is used to extract individual slides and
19
+ their contents (such as text, tables, and images) and metadata from the PPTX document.
20
+
21
+ Methods:
22
+ load(source, loaded_elements, **kwargs): Load a PPTX document.
23
+ """
24
+ include_hidden_slides: Incomplete
25
+ logger: Incomplete
26
+ def __init__(self, include_hidden_slides: bool = False) -> None:
27
+ """Initialize loader with option to include hidden slides.
28
+
29
+ Args:
30
+ include_hidden_slides (bool, optional): Whether to include hidden slides during loading. Defaults to False.
31
+ """
32
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
33
+ """Load and process a PPTX document specified by the file path and name (source).
34
+
35
+ This method defines the process of loading a PPTX document using its file path.
36
+ It is responsible for extracting content from each slide, such as text, tables, and images.
37
+
38
+ Args:
39
+ source (str): The path to the PPTX document file.
40
+ loaded_elements (list[dict[str, Any]] | None): A list of dictionaries containing loaded content and metadata. Defaults to None.
41
+ **kwargs (Any): Additional keyword arguments for the loader.
42
+
43
+ Kwargs:
44
+ original_source (str, optional): The original source of the document.
45
+
46
+ Returns:
47
+ list[dict[str, Any]]: A list of dictionaries containing loaded content and metadata.
48
+ """
@@ -0,0 +1,3 @@
1
+ from .txt_loader import TXTLoader as TXTLoader
2
+
3
+ __all__ = ['TXTLoader']
@@ -0,0 +1,55 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata
4
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
5
+ from gllm_docproc.model.element_metadata import TXT as TXT
6
+ from typing import Any
7
+
8
+ class TXTLoader(BaseLoader):
9
+ """A class for loading text files (.txt) into a list of elements.
10
+
11
+ Methods:
12
+ load: Load a text file into a list of elements.
13
+ is_text_file: Check if a file is a text file.
14
+ """
15
+ DEFAULT_SUPPORTED_PREFIX_MIME_TYPES: Incomplete
16
+ logger: Incomplete
17
+ supported_prefix_mime_types: Incomplete
18
+ def __init__(self, supported_prefix_mime_types: list[str] | None = None) -> None:
19
+ """Initialize the TXTLoader class.
20
+
21
+ Args:
22
+ supported_prefix_mime_types (list[str] | None, optional): The list of supported prefix mime types.
23
+ """
24
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
25
+ """Load a text file into a list of elements.
26
+
27
+ This method loads a text file into a list of elements.
28
+ It uses the `is_text_file` method to check if the file is a text file.
29
+ If the file is not a text file, the method will raise a ValueError.
30
+
31
+ Args:
32
+ source (str): The path to the text file.
33
+ loaded_elements (list[dict[str, Any]] | None): The list of elements that have already been loaded. Defaults to None.
34
+ **kwargs (Any): Additional keyword arguments.
35
+
36
+ Kwargs:
37
+ original_source (str, optional): The original source of the document.
38
+
39
+ Returns:
40
+ list[dict[str, Any]]: A list of elements.
41
+
42
+ Raises:
43
+ ValueError: If the file is not a text file.
44
+ """
45
+ def is_text_file(self, source: str) -> bool:
46
+ """Check if a file is a text file.
47
+
48
+ This method uses the `magic` library to check if a file is a text file.
49
+
50
+ Args:
51
+ source (str): The path to the file.
52
+
53
+ Returns:
54
+ bool: True if the file is a text file, False otherwise.
55
+ """
@@ -0,0 +1,3 @@
1
+ from .video_transcript_loader import VideoTranscriptLoader as VideoTranscriptLoader
2
+
3
+ __all__ = ['VideoTranscriptLoader']
@@ -0,0 +1,97 @@
1
+ import numpy as np
2
+ from _typeshed import Incomplete
3
+ from gllm_docproc.loader.exception import VideoConversionError as VideoConversionError
4
+
5
+ logger: Incomplete
6
+ DEFAULT_SAMPLE_RATE: int
7
+ AUDIO_NORMALIZATION_FACTOR: float
8
+ PCM_16_MAX_VALUE: int
9
+ INT16_TO_FLOAT_DIVISOR: float
10
+ GSTREAMER_MAX_BUFFERS: int
11
+
12
+ def is_supported_video_file(file_path: str) -> bool:
13
+ """Validate if a file is a supported video format using intelligent MIME detection.
14
+
15
+ Args:
16
+ file_path (str): Absolute or relative path to the file to validate.
17
+
18
+ Returns:
19
+ bool: True if the file is a supported video format, False otherwise.
20
+ """
21
+ def convert_video_to_audio_bytes(video_path: str, target_sample_rate: int | None = ..., normalize_audio: bool = True, audio_format: str = 'WAV', audio_subtype: str | None = None) -> bytes:
22
+ '''Convert video file to audio bytes optimized for audio transcription services using GStreamer.
23
+
24
+ This function provides a complete video-to-audio processing pipeline that:
25
+ 1. Extracts audio from video using GStreamer
26
+ 2. Processes and normalizes the audio signal
27
+ 3. Resamples to target sample rate (optional)
28
+ 4. Encodes to specified audio format with intelligent defaults
29
+ 5. Returns audio bytes ready for transcription services or file output
30
+
31
+ Args:
32
+ video_path (str): Path to the input video file.
33
+
34
+ target_sample_rate (int | None, optional): Target sample rate for the audio output.
35
+ Common values:
36
+ - 16000 Hz: Optimal for speech recognition (default)
37
+ - 44100 Hz: CD quality, good for music transcription
38
+ - 48000 Hz: Professional audio standard
39
+ - None: Preserve original video\'s sample rate
40
+
41
+ Higher sample rates provide better quality but larger file sizes.
42
+ Most transcription services work best with 16kHz. Defaults to 16000.
43
+
44
+ normalize_audio (bool, optional): Whether to normalize audio amplitude.
45
+ Set to False if the transcription service handles its own normalization. Defaults to True.
46
+
47
+ audio_format (str, optional): Target audio format for the output. Case-insensitive.
48
+ **Recommended formats for transcription:**
49
+ - "WAV": Uncompressed, maximum compatibility (default)
50
+
51
+ See `audio_to_bytes()` documentation for complete format list.
52
+ Defaults to "WAV".
53
+
54
+ audio_subtype (str | None, optional): Audio encoding subtype.
55
+ If None, uses format defaults: WAV→"PCM_16", FLAC→"PCM_16", MP3→"MPEG_LAYER_III", OGG→"VORBIS".
56
+ Common options: "PCM_16", "PCM_24", "PCM_32", "FLOAT".
57
+ Defaults to None.
58
+
59
+ Returns:
60
+ bytes: Audio data in the specified format, ready for transcription services or file output.
61
+
62
+ Raises:
63
+ FileNotFoundError: If the video file doesn\'t exist.
64
+ ValueError: If unsupported format, no audio stream, or invalid format-subtype combination.
65
+ VideoConversionError: For other conversion failures (decoding, processing, encoding errors).
66
+
67
+ Examples:
68
+ >>> audio_bytes = convert_video_to_audio_bytes("meeting.mp4") # Default: WAV, 16kHz
69
+ >>> audio_bytes = convert_video_to_audio_bytes("concert.mov", target_sample_rate=44100, audio_format="FLAC")
70
+ >>> audio_bytes = convert_video_to_audio_bytes("lecture.avi", audio_format="MP3") # Smaller files
71
+ '''
72
+ def audio_to_bytes(audio_data: np.ndarray, sample_rate: int, audio_format: str = 'WAV', subtype: str | None = None) -> bytes:
73
+ '''Convert audio data to specified format bytes.
74
+
75
+ Args:
76
+ audio_data (np.ndarray): Audio data as NumPy array.
77
+ sample_rate (int): Sample rate in Hz (e.g., 16000, 44100).
78
+ audio_format (str, optional): Target format. Supports WAV, FLAC, MP3, OGG, AIFF, etc.
79
+ See `soundfile.available_formats()` for the complete list.
80
+ Defaults to "WAV".
81
+ subtype (str | None, optional): Audio encoding subtype.
82
+ If None, uses soundfile\'s format defaults: WAV→"PCM_16", FLAC→"PCM_16", MP3→"MPEG_LAYER_III", OGG→"VORBIS".
83
+ See `soundfile.available_subtypes(format)` for the complete list.
84
+ Defaults to None.
85
+
86
+ Returns:
87
+ bytes: Audio data in the specified format.
88
+
89
+ Raises:
90
+ ValueError: If unsupported format/subtype or incompatible format-subtype combination.
91
+
92
+ Examples:
93
+ >>> audio_data = np.array([0.1, -0.2, 0.3], dtype=np.float32)
94
+ >>> wav_bytes = audio_to_bytes(audio_data, 16000) # WAV + PCM_16
95
+ >>> flac_bytes = audio_to_bytes(audio_data, 44100, "FLAC") # FLAC + PCM_16
96
+ >>> mp3_bytes = audio_to_bytes(audio_data, 16000, "MP3") # MP3 + MPEG_LAYER_III
97
+ '''
@@ -0,0 +1,59 @@
1
+ from _typeshed import Incomplete
2
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
3
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata
4
+ from gllm_docproc.loader.video.video_loader_utils import convert_video_to_audio_bytes as convert_video_to_audio_bytes, is_supported_video_file as is_supported_video_file
5
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
6
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, VIDEO as VIDEO
7
+ from gllm_docproc.utils import run_async_in_sync as run_async_in_sync
8
+ from gllm_multimodal.modality_converter.audio_to_text.audio_to_text import BaseAudioToText as BaseAudioToText
9
+ from gllm_multimodal.modality_converter.schema import AudioTranscript as AudioTranscript
10
+ from typing import Any
11
+
12
+ class VideoTranscriptLoader(BaseLoader):
13
+ """Video Transcript Loader class for comprehensive video processing.
14
+
15
+ This class provides a complete pipeline for processing video files by:
16
+ - Converting video to audio using GStreamer (supports MP4, AVI, MOV, MKV, WebM, etc.)
17
+ - Transcribing audio using configurable Audio-to-Text converters
18
+ - Processing transcript elements with enhanced metadata
19
+ - Converting structured elements to the GLLM format
20
+
21
+ Attributes:
22
+ audio_to_text_converters (list[BaseAudioToText]): List of audio transcription converters.
23
+ logger (Logger): Logger instance for the class.
24
+ """
25
+ logger: Incomplete
26
+ audio_to_text_converters: Incomplete
27
+ def __init__(self, audio_to_text_converters: list[BaseAudioToText] | None = None) -> None:
28
+ """Initialize the VideoTranscriptLoader class.
29
+
30
+ Args:
31
+ audio_to_text_converters (list[BaseAudioToText] | None, optional): List of audio-to-text converters.
32
+ Defaults to OpenAI Whisper if not provided.
33
+ """
34
+ def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> list[dict[str, Any]]:
35
+ """Load and process video file through the transcription pipeline.
36
+
37
+ This method processes a video file by:
38
+ 1. Converting video to audio using GStreamer
39
+ 2. Transcribing audio to text using configured Audio-to-Text converters
40
+ 3. Converting transcripts to structured Element objects
41
+ 4. Returning elements as a list of dictionaries
42
+
43
+ Supports any video format that GStreamer can handle (MP4, AVI, MOV, MKV, WebM, etc.).
44
+
45
+ Args:
46
+ source (str): Path to the video file.
47
+ loaded_elements (Any): The loaded elements from previous loaders (not used in this implementation).
48
+ **kwargs (Any): Additional keyword arguments; see the Kwargs section below.
49
+
50
+ Kwargs:
51
+ original_source (str, optional): The original source of the document.
52
+
53
+ Returns:
54
+ list[dict[str, Any]]: List of video transcript elements as dictionaries.
55
+
56
+ Raises:
57
+ FileNotFoundError: If the video file doesn't exist.
58
+ ValueError: If the file is not a supported video format or if conversion/transcription fails.
59
+ """
@@ -0,0 +1,3 @@
1
+ from .openpyxl_loader import OpenpyxlLoader as OpenpyxlLoader
2
+
3
+ __all__ = ['OpenpyxlLoader']
@@ -0,0 +1,36 @@
1
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
2
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
3
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
4
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, XLSX as XLSX
5
+ from typing import Any
6
+
7
+ class OpenpyxlLoader(BaseLoader):
8
+ """A class used to load and process XLSX documents using the openpyxl library.
9
+
10
+ This class inherits from the BaseLoader class and overrides its methods to provide
11
+ functionality for loading XLSX documents. It provides methods to extract tables from
12
+ the document, determine whether a row is a header based on its style attributes, and
13
+ split a table into headers and body based on the row styles and header threshold.
14
+
15
+ Methods:
16
+ load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any)
17
+ -> list[dict[str, Any]]: Load an XLSX document.
18
+ """
19
+ def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
20
+ """Load an XLSX document.
21
+
22
+ This method loads an XLSX document and extracts the table elements from each sheet.
23
+ The method takes the source file path as input and returns a list of element dictionaries
24
+ representing the tables in the document.
25
+
26
+ Args:
27
+ source (str): The path to the XLSX document to load.
28
+ loaded_elements (list[dict[str, Any]] | None, optional): The loaded elements from previous loaders. Defaults to None.
29
+ **kwargs (Any): Additional keyword arguments to pass to the loader.
30
+
31
+ Kwargs:
32
+ original_source (str, optional): The original source of the document.
33
+
34
+ Returns:
35
+ list[dict[str, Any]]: The loaded elements from the XLSX document.
36
+ """
@@ -0,0 +1,7 @@
1
+ from .element import Element as Element
2
+ from .element_metadata import ElementMetadata as ElementMetadata
3
+ from .loader_type import LoaderType as LoaderType
4
+ from .media import Media as Media
5
+ from .parser_type import ParserType as ParserType
6
+
7
+ __all__ = ['Element', 'ElementMetadata', 'LoaderType', 'Media', 'ParserType']