gllm-docproc-binary 0.7.22__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of gllm-docproc-binary might be problematic.

Files changed (167)
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +28 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +15 -0
  11. gllm_docproc/data_generator/__init__.pyi +5 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +18 -0
  13. gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
  14. gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
  15. gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
  16. gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
  17. gllm_docproc/downloader/__init__.pyi +5 -0
  18. gllm_docproc/downloader/base_downloader.pyi +19 -0
  19. gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
  20. gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
  21. gllm_docproc/downloader/html/__init__.pyi +6 -0
  22. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  23. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  24. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  25. gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
  26. gllm_docproc/downloader/html/html_downloader.pyi +114 -0
  27. gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
  28. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  29. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  30. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  31. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  32. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
  33. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  34. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
  35. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  36. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  37. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  38. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
  39. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  40. gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
  41. gllm_docproc/dpo_router/__init__.pyi +5 -0
  42. gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
  43. gllm_docproc/dpo_router/loader_router.pyi +52 -0
  44. gllm_docproc/dpo_router/parser_router.pyi +42 -0
  45. gllm_docproc/housekeeping/__init__.pyi +3 -0
  46. gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
  47. gllm_docproc/indexer/__init__.pyi +3 -0
  48. gllm_docproc/indexer/base_indexer.pyi +30 -0
  49. gllm_docproc/indexer/graph/__init__.pyi +4 -0
  50. gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
  51. gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
  52. gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
  53. gllm_docproc/indexer/vector/__init__.pyi +3 -0
  54. gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
  55. gllm_docproc/loader/__init__.pyi +4 -0
  56. gllm_docproc/loader/audio/__init__.pyi +3 -0
  57. gllm_docproc/loader/audio/audio_loader.pyi +45 -0
  58. gllm_docproc/loader/base_loader.pyi +30 -0
  59. gllm_docproc/loader/csv/__init__.pyi +3 -0
  60. gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
  61. gllm_docproc/loader/docx/__init__.pyi +5 -0
  62. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  63. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  64. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  65. gllm_docproc/loader/exception/__init__.pyi +4 -0
  66. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  67. gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
  68. gllm_docproc/loader/html/__init__.pyi +5 -0
  69. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  70. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  71. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  72. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +65 -0
  73. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  74. gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
  75. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  76. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  77. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  78. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  79. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  80. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  81. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  82. gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
  83. gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
  84. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  85. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  86. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  87. gllm_docproc/loader/image/__init__.pyi +3 -0
  88. gllm_docproc/loader/image/image_loader.pyi +54 -0
  89. gllm_docproc/loader/json/__init__.pyi +3 -0
  90. gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
  91. gllm_docproc/loader/loader_utils.pyi +43 -0
  92. gllm_docproc/loader/pdf/__init__.pyi +14 -0
  93. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
  94. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
  95. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
  96. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  97. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  98. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  99. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  100. gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
  101. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
  102. gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
  103. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
  104. gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
  105. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  106. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  107. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  108. gllm_docproc/loader/pptx/__init__.pyi +3 -0
  109. gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
  110. gllm_docproc/loader/txt/__init__.pyi +3 -0
  111. gllm_docproc/loader/txt/txt_loader.pyi +55 -0
  112. gllm_docproc/loader/video/__init__.pyi +3 -0
  113. gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
  114. gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
  115. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  116. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
  117. gllm_docproc/model/__init__.pyi +7 -0
  118. gllm_docproc/model/element.pyi +38 -0
  119. gllm_docproc/model/element_metadata.pyi +35 -0
  120. gllm_docproc/model/loader_type.pyi +20 -0
  121. gllm_docproc/model/media.pyi +51 -0
  122. gllm_docproc/model/parser_type.pyi +19 -0
  123. gllm_docproc/parser/__init__.pyi +4 -0
  124. gllm_docproc/parser/base_parser.pyi +28 -0
  125. gllm_docproc/parser/document/__init__.pyi +7 -0
  126. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  127. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  128. gllm_docproc/parser/document/pptx_parser.pyi +34 -0
  129. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  130. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  131. gllm_docproc/parser/html/__init__.pyi +4 -0
  132. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  133. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  134. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  135. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  136. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  137. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  138. gllm_docproc/parser/image/__init__.pyi +4 -0
  139. gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
  140. gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
  141. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  142. gllm_docproc/parser/table/__init__.pyi +3 -0
  143. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  144. gllm_docproc/request_handler/__init__.pyi +3 -0
  145. gllm_docproc/request_handler/base_request_handler.pyi +16 -0
  146. gllm_docproc/response_handler/__init__.pyi +3 -0
  147. gllm_docproc/response_handler/base_response_handler.pyi +38 -0
  148. gllm_docproc/utils/__init__.pyi +3 -0
  149. gllm_docproc/utils/async_utils.pyi +22 -0
  150. gllm_docproc/utils/file_utils.pyi +76 -0
  151. gllm_docproc/utils/html_constants.pyi +122 -0
  152. gllm_docproc/validator/__init__.pyi +6 -0
  153. gllm_docproc/validator/base_validator.pyi +34 -0
  154. gllm_docproc/validator/character_count_validator.pyi +26 -0
  155. gllm_docproc/validator/file_size_validator.pyi +20 -0
  156. gllm_docproc/validator/model/__init__.pyi +4 -0
  157. gllm_docproc/validator/model/validator_input.pyi +50 -0
  158. gllm_docproc/validator/model/validator_result.pyi +19 -0
  159. gllm_docproc/validator/page_count_validator.pyi +23 -0
  160. gllm_docproc/validator/pipeline_validator.pyi +40 -0
  161. gllm_docproc.build/.gitignore +1 -0
  162. gllm_docproc.cp311-win_amd64.pyd +0 -0
  163. gllm_docproc.pyi +220 -0
  164. gllm_docproc_binary-0.7.22.dist-info/METADATA +216 -0
  165. gllm_docproc_binary-0.7.22.dist-info/RECORD +167 -0
  166. gllm_docproc_binary-0.7.22.dist-info/WHEEL +5 -0
  167. gllm_docproc_binary-0.7.22.dist-info/top_level.txt +1 -0
File without changes
@@ -0,0 +1,3 @@
+ from .base_chunker import BaseChunker as BaseChunker
+
+ __all__ = ['BaseChunker']
@@ -0,0 +1,28 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseChunker(ABC):
+     """An abstract base class for chunkers.
+
+     This class segments or chunks elements based on contextual information.
+     Subclasses are expected to implement the 'chunk' method to handle chunking elements.
+
+     Methods:
+         chunk(elements, **kwargs): Abstract method to chunk a document.
+     """
+     @abstractmethod
+     def chunk(self, elements: Any, **kwargs: Any) -> Any:
+         """Chunk a document.
+
+         This method is abstract and must be implemented in subclasses.
+         It defines the process of chunking information from elements.
+
+         Args:
+             elements (Any): The information to be chunked, ideally formatted as List[Dict].
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             Any: The chunked information, ideally formatted as List[Dict]. Each dictionary within
+                 the list is recommended to follow the structure of the 'Element' model,
+                 to ensure consistency and ease of use across the Document Processing Orchestrator.
+         """
@@ -0,0 +1,3 @@
+ from .structured_element_chunker import StructuredElementChunker as StructuredElementChunker
+
+ __all__ = ['StructuredElementChunker']
@@ -0,0 +1,43 @@
+ from gllm_docproc.model.element import Element as Element
+ from gllm_docproc.model.element_metadata import AUDIO as AUDIO, PDF as PDF, VIDEO as VIDEO
+
+ def enrich_chunk(chunk: Element, elements: list[Element]) -> Element:
+     """Enrich the chunk with information from the original elements.
+
+     This is the default enrichment function for the structured element chunker.
+     The function enriches the chunk with information from the original elements.
+     Based on the source type, the information we want to keep differs.
+
+     Args:
+         chunk (Element): The chunk to be enriched.
+         elements (list[Element]): The original elements that form the chunk.
+
+     Returns:
+         Element: The enriched chunk.
+     """
+ def enrich_pdf_chunk(chunk: Element, elements: list[Element]) -> Element:
+     """The default function for enriching a PDF chunk.
+
+     The function enriches the PDF chunk with the coordinates and page_number information
+     of the original elements.
+
+     Args:
+         chunk (Element): The PDF chunk to be enriched.
+         elements (list[Element]): The original elements that form the chunk.
+
+     Returns:
+         Element: The enriched PDF chunk.
+     """
+ def enrich_audio_chunk(chunk: Element, elements: list[Element]) -> Element:
+     """The default function for enriching an audio chunk.
+
+     The function enriches the audio chunk by replacing double newlines with a single newline.
+     Then, it adds the start_time, end_time, and lang_id information of the original elements.
+
+     Args:
+         chunk (Element): The audio chunk to be enriched.
+         elements (list[Element]): The original elements that form the chunk.
+
+     Returns:
+         Element: The enriched audio chunk.
+     """
@@ -0,0 +1,80 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.chunker.base_chunker import BaseChunker as BaseChunker
+ from gllm_docproc.chunker.structured_element.chunk_enricher import enrich_chunk as enrich_chunk
+ from gllm_docproc.chunker.table import TableChunker as TableChunker
+ from gllm_docproc.model.element import AUDIO as AUDIO, Element as Element, FOOTER as FOOTER, FOOTNOTE as FOOTNOTE, HEADER as HEADER, HEADING as HEADING, IMAGE as IMAGE, PAGE as PAGE, TABLE as TABLE, TITLE as TITLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT, VIDEO as VIDEO
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from typing import Any
+
+ NON_TEXT_STRUCTURE: Incomplete
+
+ def default_text_splitter() -> RecursiveCharacterTextSplitter:
+     '''Define the default text splitter for structured text chunking.
+
+     This function defines the default text splitter for structured text chunking.
+     The text splitter is defined with the following separators:
+
+     1. "\\n#" : Split by Title or Heading
+     2. "\\n\\n" : Split between Paragraph Elements
+     3. "\\n" : Split between Title/Heading and Paragraph Elements
+     4. ". " | "! " | "? " : Split by Sentence
+     5. ", " : Split by Clause
+     6. " " : Split by Word
+     7. "" : Split by Character
+
+     Returns:
+         RecursiveCharacterTextSplitter: A RecursiveCharacterTextSplitter object for structured text chunking.
+     '''
+
+ class StructuredElementChunker(BaseChunker):
+     """A class for structured text chunking.
+
+     This class defines the structure for chunking structured text into smaller chunks. It implements
+     the 'chunk' method to handle structured text chunking.
+
+     Methods:
+         chunk(elements, **kwargs): Chunk the structured text into smaller chunks.
+     """
+     default_text_splitter: Incomplete
+     default_table_chunker: Incomplete
+     text_splitter: Incomplete
+     table_chunker: Incomplete
+     is_parent_structure_info_included: Incomplete
+     def __init__(self, text_splitter: RecursiveCharacterTextSplitter = ..., table_chunker: BaseChunker = ..., is_parent_structure_info_included: bool = True) -> None:
+         """Initialize the structured text chunker.
+
+         Args:
+             text_splitter (RecursiveCharacterTextSplitter): A RecursiveCharacterTextSplitter object
+                 for structured text chunking.
+             table_chunker (BaseChunker): A BaseChunker object for table chunking.
+             is_parent_structure_info_included (bool): Whether to include parent structure
+                 information in the chunk.
+         """
+     def chunk(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         '''Chunk the structured text into smaller chunks.
+
+         This method defines the process of chunking structured text into smaller chunks. It uses the
+         RecursiveCharacterTextSplitter to split the text into chunks based on the defined separators.
+
+         The method splits the text recursively based on the defined separators, which by default are:
+         1. "\\n#" : Split by Title or Heading
+         2. "\\n\\n" : Split between Paragraph Elements
+         3. "\\n" : Split between Title/Heading and Paragraph Elements
+         4. ". " | "! " | "? " : Split by Sentence
+         5. ", " : Split by Clause
+         6. " " : Split by Word
+         7. "" : Split by Character
+
+         Kwargs:
+             excluded_structures (list[str]): A list of structures to be excluded from the chunking process.
+             enrich_chunk (Callable[[Element, list[Element]], Element]): A function to enrich the chunked element.
+             file_id (str | None): The file id of the chunked elements. Defaults to None.
+
+         Args:
+             elements (list[dict[str, Any]]): A list of dictionaries containing text and structure.
+             **kwargs (Any): Additional keyword arguments for the chunker.
+
+         Returns:
+             list[dict[str, Any]]: A list of dictionaries containing chunked text and metadata.
+         '''
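
A hedged usage sketch for this chunker: the constructor parameters and the `excluded_structures` kwarg come from the stub above, but the splitter settings and example elements are assumptions.

from langchain_text_splitters import RecursiveCharacterTextSplitter

from gllm_docproc.chunker.structured_element import StructuredElementChunker

# Swap in a custom splitter instead of default_text_splitter().
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunker = StructuredElementChunker(text_splitter=splitter, is_parent_structure_info_included=True)

# Hypothetical loader output; real elements follow the Element model.
elements = [
    {"text": "Introduction", "structure": "heading"},
    {"text": "A body paragraph under the heading.", "structure": "uncategorized_text"},
]
chunks = chunker.chunk(elements, excluded_structures=["footer"])
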
@@ -0,0 +1,3 @@
+ from .table_chunker import CSV as CSV, HTML as HTML, MARKDOWN as MARKDOWN, TableChunker as TableChunker
+
+ __all__ = ['CSV', 'HTML', 'MARKDOWN', 'TableChunker']
@@ -0,0 +1,45 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.chunker.base_chunker import BaseChunker as BaseChunker
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata
+ from typing import Any
+
+ MARKDOWN: str
+ CSV: str
+ HTML: str
+
+ class TableChunker(BaseChunker):
+     """Table Chunker class.
+
+     This class is used to chunk a table element into smaller chunks. It implements the 'chunk' method
+     to handle chunking the table element based on the chunk size and overlap. The table is converted
+     into the expected format (markdown, csv, or html).
+
+     Methods:
+         chunk(elements, **kwargs): Chunk a table element into smaller chunks.
+     """
+     chunk_size: Incomplete
+     chunk_overlap: Incomplete
+     table_format: Incomplete
+     table_splitter: Incomplete
+     def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 0, table_format: str = ...) -> None:
+         """Initializes the TableChunker class.
+
+         Args:
+             chunk_size (int): The size of each chunk.
+             chunk_overlap (int): The overlap between each chunk.
+             table_format (str): The format of the table (markdown, csv, or html).
+         """
+     def chunk(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Chunk a table element into smaller chunks.
+
+         This method chunks a table element into smaller chunks based on the chunk size and overlap.
+         It converts the table into the expected format (markdown, csv, or html) and then chunks the table.
+
+         Args:
+             elements (list[dict[str, Any]]): The table element to be chunked.
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             list[dict[str, Any]]: The list of smaller chunks.
+         """
@@ -0,0 +1,3 @@
+ from .base_converter import BaseConverter as BaseConverter
+
+ __all__ = ['BaseConverter']
@@ -0,0 +1,15 @@
+ from abc import ABC, abstractmethod
+
+ class BaseConverter(ABC):
+     """Base class for document converters."""
+     @abstractmethod
+     def convert(self, path_input: str, path_output: str) -> None:
+         """Converts a document.
+
+         Args:
+             path_input (str): The path of the document to be converted.
+             path_output (str): The path of the converted document.
+
+         Returns:
+             None
+         """
@@ -0,0 +1,5 @@
+ from gllm_docproc.data_generator.base_data_generator import BaseDataGenerator as BaseDataGenerator
+ from gllm_docproc.data_generator.image_data_generator.image_caption_data_generator import ImageCaptionDataGenerator as ImageCaptionDataGenerator
+ from gllm_docproc.data_generator.image_data_generator.multi_model_image_caption_data_generator import MultiModelImageCaptionDataGenerator as MultiModelImageCaptionDataGenerator
+
+ __all__ = ['BaseDataGenerator', 'ImageCaptionDataGenerator', 'MultiModelImageCaptionDataGenerator']
@@ -0,0 +1,18 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseDataGenerator(ABC):
+     """Base class for data generators."""
+     @abstractmethod
+     def generate(self, elements: Any, **kwargs: Any) -> Any:
+         """Generates data for a list of chunks.
+
+         Args:
+             elements (Any): The elements to be used for generating data/metadata, ideally formatted as List[Dict].
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             Any: The generated data, ideally formatted as List[Dict]. Each dictionary within
+                 the list is recommended to follow the structure of the 'Element' model,
+                 to ensure consistency and ease of use across the Document Processing Orchestrator.
+         """
@@ -0,0 +1,4 @@
+ from gllm_docproc.data_generator.image_data_generator.image_caption_data_generator import ImageCaptionDataGenerator as ImageCaptionDataGenerator
+ from gllm_docproc.data_generator.image_data_generator.multi_model_image_caption_data_generator import MultiModelImageCaptionDataGenerator as MultiModelImageCaptionDataGenerator
+
+ __all__ = ['ImageCaptionDataGenerator', 'MultiModelImageCaptionDataGenerator']
@@ -0,0 +1,40 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.data_generator.base_data_generator import BaseDataGenerator as BaseDataGenerator
+ from gllm_docproc.model.element import Element as Element, PAGE as PAGE
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, IMAGE as IMAGE
+ from gllm_multimodal.modality_converter.image_to_text.image_to_caption.image_to_caption import BaseImageToCaption
+ from typing import Any
+
+ class ImageCaptionDataGenerator(BaseDataGenerator):
+     """Data generator for creating captions from images using BaseImageToCaption."""
+     DEFAULT_ELEMENT_PROCESSING_LIMIT: int
+     image_to_caption: Incomplete
+     def __init__(self, image_to_caption: BaseImageToCaption) -> None:
+         """Initialize the ImageCaptionDataGenerator.
+
+         Args:
+             image_to_caption (BaseImageToCaption): The image-to-caption converter instance.
+         """
+     def generate(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         """Generates captions by processing images in the input elements.
+
+         Args:
+             elements (list[dict[str, Any]]): List of dictionaries containing image data.
+                 Each dictionary should have an 'image_source' key with the image location.
+             **kwargs (Any): Additional keyword arguments for the image captioning process.
+
+         Kwargs:
+             image_format_func (Callable[[str, Element], str], optional): Function to format the caption text.
+                 Defaults to None.
+             element_processing_limit (int, optional): The maximum number of elements to process at a time.
+                 Defaults to 100.
+             use_image_text_as_context (bool, optional): Whether to use the image text as context.
+                 If set to False, will use `image_description` instead. Defaults to False.
+
+         Returns:
+             list[dict[str, Any]]: List of dictionaries containing the processed image data.
+                 Each dictionary will contain the original data.
+
+         Raises:
+             ValueError: If elements don't contain required image information.
+         """
@@ -0,0 +1,51 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.data_generator.base_data_generator import BaseDataGenerator as BaseDataGenerator
+ from gllm_docproc.data_generator.image_data_generator.image_caption_data_generator import ImageCaptionDataGenerator as ImageCaptionDataGenerator
+ from gllm_docproc.model.element import IMAGE as IMAGE
+ from typing import Any
+
+ DEFAULT_MODEL_ID: str
+
+ class MultiModelImageCaptionDataGenerator(BaseDataGenerator):
+     """Multi-model image captioning data generator with lazy initialization.
+
+     This class extends BaseDataGenerator to provide a data generator for image captioning that supports multiple models
+     with lazy initialization, to avoid API key validation during pipeline initialization.
+
+     Key Features:
+     1. Supports multiple models in a single instance.
+     2. Lazy initialization to avoid API key validation during initialization.
+     3. Dynamic model selection at runtime.
+     """
+     model_api_keys: Incomplete
+     logger: Incomplete
+     def __init__(self, model_api_keys: dict[str, str] | None = None) -> None:
+         """Initialize the MultiModelImageCaptionDataGenerator.
+
+         Args:
+             model_api_keys (dict[str, str] | None, optional): Dictionary mapping model IDs to their API keys.
+                 Defaults to None, in which case no API keys are passed during LMInvoker initialization.
+         """
+     def generate(self, elements: list[dict[str, Any]], **kwargs: Any) -> list[dict[str, Any]]:
+         '''Generate captions for elements with image structure.
+
+         Args:
+             elements (list[dict[str, Any]]): List of dictionaries containing elements to be processed.
+             **kwargs (Any): Additional keyword arguments for the image captioning process.
+
+         Kwargs:
+             model_id (str, optional): The ID of the model to use for image captioning.
+                 Defaults to DEFAULT_MODEL_ID, which is "google/gemini-2.5-flash".
+             system_prompt (str, optional): The system prompt to use for image captioning.
+                 Defaults to DEFAULT_SYSTEM_PROMPT.
+             user_prompt (str, optional): The user prompt to use for image captioning.
+                 Defaults to DEFAULT_USER_PROMPT.
+             default_hyperparameters (dict[str, Any]): Additional hyperparameters passed to
+                 the LMInvoker configuration. Defaults to {}.
+             retry_config (dict[str, Any], optional): The retry config to use for the LM invoker.
+                 If not provided, the default retry config is used. Defaults to {}.
+
+         Returns:
+             list[dict[str, Any]]: List of dictionaries containing the processed image data.
+                 Each dictionary will contain the original data.
+         '''
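
A hedged usage sketch: the model ID string matches the stub's stated default, while the API key and input element are placeholders.

from gllm_docproc.data_generator import MultiModelImageCaptionDataGenerator

# Keys are validated lazily at generate() time, not during construction.
generator = MultiModelImageCaptionDataGenerator(
    model_api_keys={"google/gemini-2.5-flash": "YOUR_API_KEY"}  # placeholder key
)

elements = [{"structure": "image", "image_source": "figures/diagram.png"}]  # hypothetical input
captioned = generator.generate(elements, model_id="google/gemini-2.5-flash")
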
@@ -0,0 +1 @@
+ from .pii_text_anonymization_data_generator import PIITextAnonymizationDataGenerator as PIITextAnonymizationDataGenerator
@@ -0,0 +1,5 @@
+ from .base_downloader import BaseDownloader as BaseDownloader
+ from .direct_file_url_downloader import DirectFileURLDownloader as DirectFileURLDownloader
+ from .google_drive_downloader import GoogleDriveDownloader as GoogleDriveDownloader
+
+ __all__ = ['BaseDownloader', 'DirectFileURLDownloader', 'GoogleDriveDownloader']
@@ -0,0 +1,19 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseDownloader(ABC):
+     """Base class for document downloaders."""
+     @abstractmethod
+     def download(self, source: str, output: str, **kwargs: Any) -> list[str] | None:
+         """Download the source to the output directory.
+
+         Args:
+             source (str): The source to be downloaded.
+             output (str): The output directory where the downloaded source will be saved.
+             **kwargs (Any): Additional keyword arguments.
+
+         Returns:
+             list[str] | None: A list of file paths of successfully downloaded files.
+                 If no files are downloaded, an empty list should be returned.
+                 Returning None is only for backward compatibility and should be avoided in new implementations.
+         """
@@ -0,0 +1,40 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.downloader import BaseDownloader as BaseDownloader
+ from typing import Any
+
+ UNKNOWN_EXTENSION: str
+ FALLBACK_EXTENSION: str
+ MIME_TYPE_TO_EXTENSION_MAP: Incomplete
+
+ class DirectFileURLDownloader(BaseDownloader):
+     """A class for downloading files from a direct file URL to the defined output directory."""
+     stream_buffer_size: Incomplete
+     max_retries: Incomplete
+     timeout: Incomplete
+     session: Incomplete
+     logger: Incomplete
+     def __init__(self, stream_buffer_size: int = 65536, max_retries: int = 3, timeout: int | None = None) -> None:
+         """Initialize the DirectFileURLDownloader.
+
+         Args:
+             stream_buffer_size (int, optional): The size of the buffer for streaming downloads in bytes.
+                 Defaults to 64KB (65536 bytes).
+             max_retries (int, optional): The maximum number of retries for failed downloads. Defaults to 3.
+             timeout (int | None, optional): The timeout for the download request in seconds. Defaults to None.
+         """
+     def download(self, source: str, output: str, **kwargs: Any) -> list[str]:
+         """Download the source to the output directory.
+
+         Args:
+             source (str): The source to be downloaded.
+             output (str): The output directory where the downloaded source will be saved.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             ca_certs_path (str, optional): The path to the CA certificates file. Defaults to None.
+             extension (str, optional): The extension of the file to be downloaded. If not provided,
+                 the extension will be detected from the response headers or content MIME type.
+
+         Returns:
+             list[str]: A list of file paths of successfully downloaded files.
+         """
@@ -0,0 +1,36 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.downloader import BaseDownloader as BaseDownloader
+ from typing import Any
+
+ class GoogleDriveDownloader(BaseDownloader):
+     """A class for downloading files from Google Drive using the BOSA connector for Google Drive integration."""
+     bosa: Incomplete
+     user: Incomplete
+     google_drive: Incomplete
+     logger: Incomplete
+     def __init__(self, api_key: str, identifier: str, secret: str, api_base_url: str = 'https://api.bosa.id') -> None:
+         '''Initialize the GoogleDriveDownloader.
+
+         Args:
+             api_key (str): The API key for the BOSA API.
+             identifier (str): The identifier for the BOSA user.
+             secret (str): The secret for the BOSA user.
+             api_base_url (str, optional): The base URL for the BOSA API. Defaults to "https://api.bosa.id".
+         '''
+     def download(self, source: str, output: str, **kwargs: Any) -> list[str]:
+         """Download a file from Google Drive to the output directory.
+
+         Args:
+             source (str): The Google Drive file ID or URL.
+             output (str): The output directory where the downloaded file will be saved.
+             **kwargs (Any): Additional keyword arguments.
+
+         Kwargs:
+             export_format (str, optional): The export format for the file.
+
+         Returns:
+             list[str]: A list containing the path(s) to the successfully downloaded file(s).
+
+         Raises:
+             ValueError: If the file ID cannot be extracted or no files are returned from Google Drive.
+         """
@@ -0,0 +1,6 @@
+ from . import utils as utils
+ from .firecrawl_downloader import HTMLFirecrawlDownloader as HTMLFirecrawlDownloader
+ from .html_downloader import HTMLDownloader as HTMLDownloader
+ from .requests_downloader import RequestsDownloader as RequestsDownloader
+
+ __all__ = ['HTMLDownloader', 'RequestsDownloader', 'HTMLFirecrawlDownloader', 'utils']
@@ -0,0 +1,4 @@
+ from .item_scrape_failed_exception import ItemScrapeFailedException as ItemScrapeFailedException
+ from .zyte_api_key_not_provided_exception import ZyteApiKeyNotProvidedException as ZyteApiKeyNotProvidedException
+
+ __all__ = ['ItemScrapeFailedException', 'ZyteApiKeyNotProvidedException']
@@ -0,0 +1,16 @@
+ from _typeshed import Incomplete
+
+ class ItemScrapeFailedException(Exception):
+     """Exception raised when an item fails to be scraped.
+
+     Attributes:
+         message (str): Optional. The error message indicating the reason for the item scrape failure.
+     """
+     message: Incomplete
+     def __init__(self, message: str = 'Item failed to be scraped.') -> None:
+         '''Initialize the ItemScrapeFailedException.
+
+         Args:
+             message (str): Optional. The error message indicating the reason for the item scrape failure.
+                 Defaults to "Item failed to be scraped."
+         '''
@@ -0,0 +1,15 @@
+ from _typeshed import Incomplete
+
+ class ZyteApiKeyNotProvidedException(Exception):
+     """Custom exception raised when the Zyte API key is not provided.
+
+     Attributes:
+         message (str): Optional. The error message associated with the exception.
+     """
+     message: Incomplete
+     def __init__(self, message: str = 'Zyte API Key not provided.') -> None:
+         '''Initialize the ZyteApiKeyNotProvidedException.
+
+         Args:
+             message (str, optional): The error message associated with the exception. Defaults to "Zyte API Key not provided."
+         '''
@@ -0,0 +1,49 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.downloader.base_downloader import BaseDownloader as BaseDownloader
+ from gllm_docproc.downloader.html.utils.web_utils import generate_filename_from_url as generate_filename_from_url
+ from gllm_docproc.model.element_metadata import ElementMetadata as ElementMetadata, HTML as HTML
+ from gllm_docproc.utils.file_utils import save_to_json as save_to_json
+ from typing import Any
+
+ class HTMLFirecrawlDownloader(BaseDownloader):
+     """A downloader class for downloading web content using Firecrawl.
+
+     This class inherits from the BaseDownloader class and provides methods to download web content using Firecrawl.
+
+     Attributes:
+         firecrawl_instance (FirecrawlApp): The Firecrawl instance.
+     """
+     logger: Incomplete
+     firecrawl_instance: Incomplete
+     def __init__(self, api_key: str, api_url: str | None = None) -> None:
+         """Initialize the Firecrawl downloader.
+
+         Args:
+             api_key (str): The API key for Firecrawl.
+             api_url (str, optional): The API URL for Firecrawl.
+         """
+     def download(self, source: str, output: str, **kwargs: Any) -> list[str]:
+         """Download content and save it to a file as JSON.
+
+         Args:
+             source (str): The URL to scrape.
+             output (str): The directory path where the downloaded content (in JSON format) will be saved.
+             **kwargs (Any): Additional arguments to pass to the scraper.
+
+         Kwargs:
+             formats (list[str], optional): The formats to scrape. Supported formats include:
+                 - markdown: Returns the markdown content of the page.
+                 - html: Returns the processed HTML content of the page.
+                 - rawHtml: Provides the unmodified, raw HTML content of the entire webpage.
+                 - screenshot: Returns a screenshot of the page.
+                 - screenshot@fullPage: Returns a screenshot of the full page.
+                 - links: Extracts and returns a list of all links found on the scraped page.
+                 - json: Allows for structured data extraction; use `json_options` to define
+                     the schema for the output.
+
+                 For a comprehensive list of supported formats, refer to the Firecrawl documentation.
+                 Defaults to ['html'].
+
+         Returns:
+             list[str]: The list of full file paths of the created JSON files.
+         """