gllm-docproc-binary 0.7.26__cp311-cp311-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of gllm-docproc-binary has been flagged as potentially problematic.

Files changed (168)
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +28 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +15 -0
  11. gllm_docproc/data_generator/__init__.pyi +5 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +18 -0
  13. gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
  14. gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
  15. gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
  16. gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
  17. gllm_docproc/downloader/__init__.pyi +5 -0
  18. gllm_docproc/downloader/base_downloader.pyi +19 -0
  19. gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
  20. gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
  21. gllm_docproc/downloader/html/__init__.pyi +7 -0
  22. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  23. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  24. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  25. gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
  26. gllm_docproc/downloader/html/html_downloader.pyi +114 -0
  27. gllm_docproc/downloader/html/playwright_downloader.pyi +60 -0
  28. gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
  29. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  30. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  31. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  32. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  33. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
  34. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  35. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
  36. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  37. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  38. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  39. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
  40. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  41. gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
  42. gllm_docproc/dpo_router/__init__.pyi +5 -0
  43. gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
  44. gllm_docproc/dpo_router/loader_router.pyi +52 -0
  45. gllm_docproc/dpo_router/parser_router.pyi +42 -0
  46. gllm_docproc/housekeeping/__init__.pyi +3 -0
  47. gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
  48. gllm_docproc/indexer/__init__.pyi +3 -0
  49. gllm_docproc/indexer/base_indexer.pyi +30 -0
  50. gllm_docproc/indexer/graph/__init__.pyi +4 -0
  51. gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
  52. gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
  53. gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
  54. gllm_docproc/indexer/vector/__init__.pyi +3 -0
  55. gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
  56. gllm_docproc/loader/__init__.pyi +4 -0
  57. gllm_docproc/loader/audio/__init__.pyi +3 -0
  58. gllm_docproc/loader/audio/audio_loader.pyi +45 -0
  59. gllm_docproc/loader/base_loader.pyi +30 -0
  60. gllm_docproc/loader/csv/__init__.pyi +3 -0
  61. gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
  62. gllm_docproc/loader/docx/__init__.pyi +5 -0
  63. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  64. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  65. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  66. gllm_docproc/loader/exception/__init__.pyi +4 -0
  67. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  68. gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
  69. gllm_docproc/loader/html/__init__.pyi +5 -0
  70. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  71. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  72. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  73. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +66 -0
  74. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  75. gllm_docproc/loader/html/flat/html_flat_merger.pyi +23 -0
  76. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  77. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  78. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  79. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  80. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  81. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  82. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  83. gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
  84. gllm_docproc/loader/html/utils/html_utils.pyi +59 -0
  85. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  86. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  87. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  88. gllm_docproc/loader/image/__init__.pyi +3 -0
  89. gllm_docproc/loader/image/image_loader.pyi +54 -0
  90. gllm_docproc/loader/json/__init__.pyi +3 -0
  91. gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
  92. gllm_docproc/loader/loader_utils.pyi +43 -0
  93. gllm_docproc/loader/pdf/__init__.pyi +14 -0
  94. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
  95. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
  96. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
  97. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  98. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  99. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  100. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  101. gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
  102. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
  103. gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
  104. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
  105. gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
  106. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  107. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  108. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  109. gllm_docproc/loader/pptx/__init__.pyi +3 -0
  110. gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
  111. gllm_docproc/loader/txt/__init__.pyi +3 -0
  112. gllm_docproc/loader/txt/txt_loader.pyi +55 -0
  113. gllm_docproc/loader/video/__init__.pyi +3 -0
  114. gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
  115. gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
  116. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  117. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
  118. gllm_docproc/model/__init__.pyi +7 -0
  119. gllm_docproc/model/element.pyi +38 -0
  120. gllm_docproc/model/element_metadata.pyi +35 -0
  121. gllm_docproc/model/loader_type.pyi +20 -0
  122. gllm_docproc/model/media.pyi +51 -0
  123. gllm_docproc/model/parser_type.pyi +19 -0
  124. gllm_docproc/parser/__init__.pyi +4 -0
  125. gllm_docproc/parser/base_parser.pyi +28 -0
  126. gllm_docproc/parser/document/__init__.pyi +7 -0
  127. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  128. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  129. gllm_docproc/parser/document/pptx_parser.pyi +34 -0
  130. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  131. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  132. gllm_docproc/parser/html/__init__.pyi +4 -0
  133. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  134. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  135. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  136. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  137. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  138. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  139. gllm_docproc/parser/image/__init__.pyi +4 -0
  140. gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
  141. gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
  142. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  143. gllm_docproc/parser/table/__init__.pyi +3 -0
  144. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  145. gllm_docproc/request_handler/__init__.pyi +3 -0
  146. gllm_docproc/request_handler/base_request_handler.pyi +16 -0
  147. gllm_docproc/response_handler/__init__.pyi +3 -0
  148. gllm_docproc/response_handler/base_response_handler.pyi +38 -0
  149. gllm_docproc/utils/__init__.pyi +3 -0
  150. gllm_docproc/utils/async_utils.pyi +22 -0
  151. gllm_docproc/utils/file_utils.pyi +76 -0
  152. gllm_docproc/utils/html_constants.pyi +122 -0
  153. gllm_docproc/validator/__init__.pyi +6 -0
  154. gllm_docproc/validator/base_validator.pyi +34 -0
  155. gllm_docproc/validator/character_count_validator.pyi +26 -0
  156. gllm_docproc/validator/file_size_validator.pyi +20 -0
  157. gllm_docproc/validator/model/__init__.pyi +4 -0
  158. gllm_docproc/validator/model/validator_input.pyi +50 -0
  159. gllm_docproc/validator/model/validator_result.pyi +19 -0
  160. gllm_docproc/validator/page_count_validator.pyi +23 -0
  161. gllm_docproc/validator/pipeline_validator.pyi +40 -0
  162. gllm_docproc.build/.gitignore +1 -0
  163. gllm_docproc.cpython-311-darwin.so +0 -0
  164. gllm_docproc.pyi +222 -0
  165. gllm_docproc_binary-0.7.26.dist-info/METADATA +216 -0
  166. gllm_docproc_binary-0.7.26.dist-info/RECORD +168 -0
  167. gllm_docproc_binary-0.7.26.dist-info/WHEEL +5 -0
  168. gllm_docproc_binary-0.7.26.dist-info/top_level.txt +1 -0
gllm_docproc/dpo_router/base_dpo_router.pyi
@@ -0,0 +1,16 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseDPORouter(ABC):
+     """Base class for routing in document processing."""
+     @abstractmethod
+     def route(self, *args: Any, **kwargs: Any) -> Any:
+         """Routes the input into different processing pipelines based on certain criteria.
+
+         Args:
+             *args (Any): Variable length argument list for routing parameters.
+             **kwargs (Any): Arbitrary keyword arguments for additional routing configuration.
+
+         Returns:
+             Any: The result of the routing process.
+         """

gllm_docproc/dpo_router/loader_router.pyi
@@ -0,0 +1,52 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.dpo_router.base_dpo_router import BaseDPORouter as BaseDPORouter
+ from gllm_docproc.loader.csv.pandas_loader import CSV_VARIANTS as CSV_VARIANTS
+ from gllm_docproc.loader.image import ImageLoader as ImageLoader
+ from gllm_docproc.loader.json.json_elements_loader import JSON as JSON
+ from gllm_docproc.loader.txt import TXTLoader as TXTLoader
+ from gllm_docproc.loader.video.video_loader_utils import is_supported_video_file as is_supported_video_file
+ from gllm_docproc.model import Element as Element, LoaderType as LoaderType
+ from gllm_docproc.model.element_metadata import DOCX as DOCX, HTML as HTML, PDF as PDF, PPTX as PPTX, XLSX as XLSX
+ from typing import Any
+
+ class LoaderRouter(BaseDPORouter):
+     """Loader Router class.
+
+     This router determines the appropriate loader type based on the input source.
+     Returns a dict with the loader type information.
+     """
+     logger: Incomplete
+     txt_loader: Incomplete
+     image_loader: Incomplete
+     def __init__(self) -> None:
+         """Initialize the LoaderRouter.
+
+         This method initializes the LoaderRouter.
+         """
+     def route(self, source: str, *args: Any, **kwargs: Any) -> dict[str, Any]:
+         """Route the input source to the appropriate loader type.
+
+         This method determines the appropriate loader type based on the input source.
+         It checks if the source is a file or a YouTube URL.
+         1. If it is a file, it checks the file extension or content to determine the loader type.
+         2. If it is a YouTube URL, it returns the audio loader type.
+         3. If it is not a file or a YouTube URL, it returns the uncategorized loader type.
+
+         Args:
+             source (str): The input source, either a file path or a YouTube URL.
+             *args (Any): Additional arguments.
+             **kwargs (Any): Additional keyword arguments.
+
+         Returns:
+             dict[str, Any]: A dictionary containing the loader type information.
+                 Example: {LoaderType.KEY: LoaderType.PDF_LOADER}
+         """
+     def is_html_from_json(self, source: str) -> bool:
+         """Check if the source file contains valid HTML metadata.
+
+         Args:
+             source (str): The file path to check.
+
+         Returns:
+             bool: True if the file is a valid HTML metadata file, False otherwise.
+         """

gllm_docproc/dpo_router/parser_router.pyi
@@ -0,0 +1,42 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.dpo_router.base_dpo_router import BaseDPORouter as BaseDPORouter
+ from gllm_docproc.model.element import Element as Element
+ from gllm_docproc.model.element_metadata import AUDIO as AUDIO, CSV as CSV, DOCX as DOCX, HTML as HTML, IMAGE as IMAGE, PDF as PDF, PPTX as PPTX, TXT as TXT, VIDEO as VIDEO, XLSX as XLSX
+ from gllm_docproc.model.parser_type import ParserType as ParserType
+ from typing import Any
+
+ class ParserRouter(BaseDPORouter):
+     """Parser Router class.
+
+     This router determines the appropriate parser type based on the input source.
+     Returns a dict with the parser type information.
+     """
+     logger: Incomplete
+     source_type_to_parser_type: Incomplete
+     def __init__(self) -> None:
+         """Initialize the ParserRouter.
+
+         This method initializes the ParserRouter.
+         """
+     def route(self, source: str | list[dict[str, Any]], *args, **kwargs) -> dict[str, Any]:
+         """Determine the parser type from the input source.
+
+         The input source can be:
+         - A string path to a JSON file containing loaded elements.
+         - A list of loaded element dictionaries (in memory).
+
+         This method reads the input, extracts the `source_type` from the first element's metadata,
+         and returns the appropriate parser type. If loading fails or metadata is missing,
+         the parser type will be set as UNCATEGORIZED.
+
+         Args:
+             source (str | list[dict[str, Any]]): The input source, which can be either:
+                 - str: path to a JSON file containing loaded elements.
+                 - list[dict[str, Any]]: loaded elements.
+             *args (Any): Additional arguments.
+             **kwargs (Any): Additional keyword arguments.
+
+         Returns:
+             dict[str, Any]: A dictionary containing the parser type information.
+                 Example: {ParserType.KEY: ParserType.PDF_PARSER}
+         """

gllm_docproc/housekeeping/__init__.pyi
@@ -0,0 +1,3 @@
+ from .base_housekeeping import BaseHouseKeeping as BaseHouseKeeping
+
+ __all__ = ['BaseHouseKeeping']

gllm_docproc/housekeeping/base_housekeeping.pyi
@@ -0,0 +1,14 @@
+ from abc import ABC, abstractmethod
+
+ class BaseHouseKeeping(ABC):
+     """Base class for housekeeping tasks."""
+     @abstractmethod
+     def housekeeping(self, folder_path: str) -> None:
+         """Placeholder method for performing housekeeping tasks on a specified folder.
+
+         Args:
+             folder_path (str): The path to the folder to perform housekeeping on.
+
+         Returns:
+             None
+         """

gllm_docproc/indexer/__init__.pyi
@@ -0,0 +1,3 @@
+ from gllm_docproc.indexer.base_indexer import BaseIndexer as BaseIndexer
+
+ __all__ = ['BaseIndexer']

gllm_docproc/indexer/base_indexer.pyi
@@ -0,0 +1,30 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseIndexer(ABC):
+     """Base class for document indexers."""
+     @abstractmethod
+     def index(self, elements: Any, **kwargs: Any) -> Any:
+         """Index data from a source file into the target data store.
+
+         Args:
+             elements (Any): The information to be indexed. Ideally formatted as List[Dict], with
+                 each Dict following the structure of model 'Element'.
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             Any: The response from the indexing process.
+         """
+     @abstractmethod
+     def delete(self, **kwargs: Any) -> Any:
+         """Delete a document from a vector DB.
+
+         The arguments are not defined yet; they depend on the implementation.
+         Some vector databases will require: db_url, index_name, document_id.
+
+         Args:
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             Any: The response from the deletion process.
+         """

gllm_docproc/indexer/graph/__init__.pyi
@@ -0,0 +1,4 @@
+ from gllm_docproc.indexer.graph.light_rag_graph_rag_indexer import LightRAGGraphRAGIndexer as LightRAGGraphRAGIndexer
+ from gllm_docproc.indexer.graph.llama_index_graph_rag_indexer import LlamaIndexGraphRAGIndexer as LlamaIndexGraphRAGIndexer
+
+ __all__ = ['LlamaIndexGraphRAGIndexer', 'LightRAGGraphRAGIndexer']

gllm_docproc/indexer/graph/graph_rag_indexer.pyi
@@ -0,0 +1,11 @@
+ from abc import ABC, abstractmethod
+ from gllm_docproc.indexer import BaseIndexer as BaseIndexer
+
+ class BaseGraphRAGIndexer(BaseIndexer, ABC):
+     """Abstract base class for Graph RAG Indexer.
+
+     This class defines the interface for a Graph RAG Indexer.
+     """
+     @abstractmethod
+     def resolve_entities(self) -> None:
+         """Resolve entities in the graph."""

gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi
@@ -0,0 +1,97 @@
+ from gllm_datastore.graph_data_store.light_rag_data_store import BaseLightRAGDataStore
+ from gllm_docproc.indexer.graph.graph_rag_indexer import BaseGraphRAGIndexer as BaseGraphRAGIndexer
+ from gllm_docproc.model.element import Element as Element
+ from typing import Any
+
+ class LightRAGGraphRAGIndexer(BaseGraphRAGIndexer):
+     '''Indexer for LightRAG-based graph RAG.
+
+     How to run LightRAG with PostgreSQL using Docker:
+     ```bash
+     docker run -p 5455:5432 -d --name postgres-LightRag shangor/postgres-for-rag:v1.0 sh -c "service postgresql start && sleep infinity"
+     ```
+
+     Example:
+     ```python
+     from gllm_inference.em_invoker import OpenAIEMInvoker
+     from gllm_inference.lm_invoker import OpenAILMInvoker
+     from gllm_docproc.indexer.graph.light_rag_graph_rag_indexer import LightRAGGraphRAGIndexer
+     from gllm_datastore.graph_data_store.light_rag_postgres_data_store import LightRAGPostgresDataStore
+
+     # Create the LightRAGPostgresDataStore instance
+     graph_store = LightRAGPostgresDataStore(
+         lm_invoker=OpenAILMInvoker(model_name="gpt-4o-mini"),
+         em_invoker=OpenAIEMInvoker(model_name="text-embedding-3-small"),
+         postgres_db_host="localhost",
+         postgres_db_port=5455,
+         postgres_db_user="rag",
+         postgres_db_password="rag",
+         postgres_db_name="rag",
+         postgres_db_workspace="default",
+     )
+
+     # Create the indexer
+     indexer = LightRAGGraphRAGIndexer(graph_store=graph_store)
+
+     # Create elements to index
+     elements = [
+         {
+             "text": "This is a sample document about AI.",
+             "structure": "uncategorized",
+             "metadata": {
+                 "source": "sample.txt",
+                 "source_type": "TEXT",
+                 "loaded_datetime": "2025-07-10T12:00:00",
+                 "chunk_id": "chunk_001",
+                 "file_id": "file_001"
+             }
+         }
+     ]
+
+     # Index the elements
+     indexer.index(elements)
+     ```
+
+     Attributes:
+         _graph_store (BaseLightRAGDataStore): The LightRAG data store used for indexing and querying.
+     '''
+     def __init__(self, graph_store: BaseLightRAGDataStore) -> None:
+         """Initialize the LightRAGGraphRAGIndexer.
+
+         Args:
+             graph_store (BaseLightRAGDataStore): The LightRAG instance to use for indexing.
+         """
+     def index(self, elements: list[dict[str, Any]], **kwargs: Any) -> None:
+         """Index elements into the LightRAG system and create graph relationships.
+
+         This method extracts text and chunk IDs from the provided elements,
+         inserts them into the LightRAG system, and creates a graph structure
+         connecting files to chunks.
+
+         Args:
+             elements (list[dict[str, Any]]): List of Element objects containing text and metadata.
+                 Each element should have a metadata attribute with a chunk_id and a file_id.
+             **kwargs (Any): Additional keyword arguments.
+         """
+     def delete(self, file_id: str | None = None, chunk_id: str | None = None, entity_id: str | None = None, **kwargs: Any) -> None:
+         """Delete entities from the LightRAG system and graph.
+
+         Supports multiple deletion modes based on the provided keyword arguments.
+         Exactly one of the supported deletion parameters must be provided.
+
+         Args:
+             file_id (str, optional): Delete a file and all its associated chunks. Defaults to None.
+             chunk_id (str, optional): Delete a specific chunk entity. Defaults to None.
+             entity_id (str, optional): Delete a specific entity or node. Defaults to None.
+             **kwargs (Any): Additional keyword arguments.
+
+         Raises:
+             ValueError: If no deletion parameter is provided or multiple are provided.
+         """
+     def resolve_entities(self) -> None:
+         """Resolve entities in the graph.
+
+         Currently, this method does nothing; entity resolution is implicitly
+         handled by the LightRAG instance.
+         """

gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi
@@ -0,0 +1,79 @@
+ from _typeshed import Incomplete
+ from gllm_datastore.graph_data_store.llama_index_graph_rag_data_store import LlamaIndexGraphRAGDataStore
+ from gllm_docproc.indexer.graph.graph_rag_indexer import BaseGraphRAGIndexer as BaseGraphRAGIndexer
+ from gllm_docproc.indexer.graph.utils.schema_validator import validate_kg_schema as validate_kg_schema
+ from gllm_docproc.model.element import Element as Element
+ from llama_index.core.base.embeddings.base import BaseEmbedding
+ from llama_index.core.base.llms.base import BaseLLM
+ from llama_index.core.schema import TransformComponent as TransformComponent
+ from llama_index.core.vector_stores.types import BasePydanticVectorStore
+ from typing import Any
+
+ logger: Incomplete
+
+ class LlamaIndexGraphRAGIndexer(BaseGraphRAGIndexer):
+     """Indexer for graph RAG using LlamaIndex.
+
+     Attributes:
+         _index (PropertyGraphIndex): Property graph index.
+         _graph_store (LlamaIndexGraphRAGDataStore): Storage for property graph.
+         _strict_mode (bool): Whether strict schema validation is enabled.
+     """
+     def __init__(self, graph_store: LlamaIndexGraphRAGDataStore, llama_index_llm: BaseLLM | None = None, allowed_entity_types: list[str] | None = None, allowed_relation_types: list[str] | None = None, kg_validation_schema: dict[str, list[str]] | None = None, strict_mode: bool = False, kg_extractors: list[TransformComponent] | None = None, embed_model: BaseEmbedding | None = None, vector_store: BasePydanticVectorStore | None = None, max_triplets_per_chunk: int = 10, num_workers: int = 4, **kwargs: Any) -> None:
+         '''Initialize the LlamaIndexGraphRAGIndexer.
+
+         Args:
+             graph_store (LlamaIndexGraphRAGDataStore): Storage for property graph.
+             llama_index_llm (BaseLLM | None, optional): Language model for LlamaIndex. Defaults to None.
+             allowed_entity_types (list[str] | None, optional): List of allowed entity types. When strict_mode=True,
+                 only these types are extracted. When strict_mode=False, serves as hints. Defaults to None.
+             allowed_relation_types (list[str] | None, optional): List of allowed relationship types. Behavior depends
+                 on strict_mode. Defaults to None.
+             kg_validation_schema (dict[str, list[str]] | None, optional): Validation schema for
+                 strict mode. Maps entity types to their allowed outgoing relationship types.
+                 Format: {"ENTITY_TYPE": ["ALLOWED_REL1", "ALLOWED_REL2"], ...}
+                 Example: {"PERSON": ["WORKS_AT", "FOUNDED"], "ORGANIZATION": ["LOCATED_IN"]}
+                 Defaults to None.
+             strict_mode (bool, optional): If True, uses SchemaLLMPathExtractor with strict validation.
+                 If False (default), uses DynamicLLMPathExtractor with optional guidance. Defaults to False.
+             kg_extractors (list[TransformComponent] | None, optional): Custom list of extractors.
+                 If provided, overrides automatic extractor selection based on strict_mode. Defaults to None.
+             embed_model (BaseEmbedding | None, optional): Embedding model for vector representations. Defaults to None.
+             vector_store (BasePydanticVectorStore | None, optional): Storage for vector data. Defaults to None.
+             max_triplets_per_chunk (int, optional): Maximum triplets to extract per chunk. Defaults to 10.
+             num_workers (int, optional): Number of parallel workers. Defaults to 4.
+             **kwargs (Any): Additional keyword arguments.
+         '''
+     def index(self, elements: list[Element] | list[dict[str, Any]], **kwargs: Any) -> None:
+         """Index elements into the graph.
+
+         This method indexes elements into the graph.
+
+         Notes:
+             - Currently, only Neo4jPropertyGraphStore is supported for indexing the metadata from the TextNode.
+             - The 'document_id' parameter is used to specify the document ID for the elements.
+             - The 'chunk_id' parameter is used to specify the chunk ID for the elements.
+
+         Args:
+             elements (list[Element] | list[dict[str, Any]]): List of elements or list of dictionaries representing
+                 elements to be indexed.
+             **kwargs (Any): Additional keyword arguments.
+         """
+     def resolve_entities(self) -> None:
+         """Resolve entities in the graph.
+
+         Currently, this method does nothing.
+         """
+     def delete(self, **kwargs: Any) -> None:
+         """Delete elements from the knowledge graph.
+
+         This method deletes elements from the knowledge graph based on the provided document_id.
+
+         Args:
+             **kwargs (Any): Additional keyword arguments.
+
+         Raises:
+             ValueError: If document_id is not provided.
+             Exception: If an error occurs during deletion.
+         """

gllm_docproc/indexer/vector/__init__.pyi
@@ -0,0 +1,3 @@
+ from .vector_db_indexer import VectorDBIndexer as VectorDBIndexer
+
+ __all__ = ['VectorDBIndexer']

gllm_docproc/indexer/vector/vector_db_indexer.pyi
@@ -0,0 +1,53 @@
+ from _typeshed import Incomplete
+ from gllm_datastore.core.capabilities import VectorCapability
+ from gllm_docproc.indexer import BaseIndexer as BaseIndexer
+ from gllm_docproc.model.element import Element as Element
+ from gllm_inference.schema import Vector as Vector
+ from typing import Any, TypeVar
+
+ T = TypeVar('T')
+
+ class VectorDBIndexer(BaseIndexer):
+     """Index elements into a vector datastore capability."""
+     logger: Incomplete
+     vector_capability: Incomplete
+     def __init__(self, vector_capability: VectorCapability) -> None:
+         """Initialize the indexer with a vector capability instance.
+
+         Args:
+             vector_capability (VectorCapability): The capability implementation
+                 (for example, `ElasticsearchVectorCapability`) that will receive
+                 chunks for indexing operations. Must be set before calling
+                 indexing methods.
+         """
+     def index(self, elements: list[dict[str, Any]], **kwargs: Any) -> None:
+         """Index elements into the configured vector capability.
+
+         Args:
+             elements (list[dict[str, Any]]): Parsed elements containing text and metadata.
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Kwargs:
+             replace_file_id (str, optional): File identifier to be replaced before indexing.
+                 Defaults to None. If provided, existing records for this file_id are removed first.
+             batch_size (int, optional): The number of chunks to process in each batch.
+                 Defaults to 100.
+             max_retries (int, optional): The maximum number of retry attempts for failed batches.
+                 Defaults to 3.
+             vectors (list[Vector] | None, optional): Pre-computed vectors for the elements.
+                 If provided, uses create_from_vector instead of create. Must match the length
+                 of elements. Defaults to None.
+
+         Raises:
+             Exception: If an error occurs during indexing.
+         """
+     def delete(self, **kwargs: Any) -> None:
+         """Delete documents from the vector capability based on the file ID.
+
+         Kwargs:
+             file_id (str): The ID of the file(s) to be deleted.
+
+         Raises:
+             ValueError: If file_id is not provided.
+             Exception: If an error occurs during deletion.
+         """

gllm_docproc/loader/__init__.pyi
@@ -0,0 +1,4 @@
+ from .base_loader import BaseLoader as BaseLoader
+ from .pipeline_loader import PipelineLoader as PipelineLoader
+
+ __all__ = ['BaseLoader', 'PipelineLoader']

gllm_docproc/loader/audio/__init__.pyi
@@ -0,0 +1,3 @@
+ from .audio_loader import AudioLoader as AudioLoader
+
+ __all__ = ['AudioLoader']

gllm_docproc/loader/audio/audio_loader.pyi
@@ -0,0 +1,45 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata
+ from gllm_docproc.model.element import Element as Element, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.model.element_metadata import AUDIO as AUDIO, ElementMetadata as ElementMetadata
+ from gllm_multimodal.modality_converter.audio_to_text.audio_to_text import BaseAudioToText as BaseAudioToText
+ from gllm_multimodal.modality_converter.schema import AudioTranscript as AudioTranscript
+ from typing import Any
+
+ class AudioLoader(BaseLoader):
+     """Audio Loader class.
+
+     This class provides a loader that extracts information from audio files
+     using GLLM Multimodal Audio to Text. It implements the 'load' method to handle
+     document loading from a given source.
+
+     Attributes:
+         audio_to_text_converters (list[BaseAudioToText]): A list of audio-to-text converters from GLLM Multimodal.
+     """
+     audio_to_text_converters: Incomplete
+     logger: Incomplete
+     def __init__(self, audio_to_text_converters: list[BaseAudioToText] | None = None) -> None:
+         """Initialize the AudioLoader class.
+
+         Args:
+             audio_to_text_converters (list[BaseAudioToText] | None): A list of audio-to-text converters.
+                 If None, defaults to using YouTubeTranscriptAudioToText and OpenAIWhisperAudioToText.
+         """
+     def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load and process an audio file using the GLLM Multimodal Audio to Text.
+
+         This method transcribes the audio file using the GLLM Multimodal Audio to Text,
+         then converts the transcription results to elements.
+
+         Args:
+             source (str): The source of the audio file to be transcribed.
+             loaded_elements (Any): The loaded elements to be processed.
+             **kwargs (Any): Additional keyword arguments for the loader.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+
+         Returns:
+             list[dict[str, Any]]: The loaded elements.
+         """

gllm_docproc/loader/base_loader.pyi
@@ -0,0 +1,30 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ class BaseLoader(ABC):
+     """An abstract base class for document loaders.
+
+     This class defines the structure for loading and processing documents to retrieve
+     required values. Subclasses are expected to implement the 'load' method
+     to handle document loading from a given source.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Abstract method to load a document.
+     """
+     @abstractmethod
+     def load(self, source: str, loaded_elements: Any = None, **kwargs: Any) -> Any:
+         """Load and process a document.
+
+         This method is abstract and must be implemented in subclasses.
+         It defines the process of loading a document using its source.
+
+         Args:
+             source (str): May be a file path, a URL, or the content itself.
+             loaded_elements (Any): The loaded elements from previous loaders, ideally formatted as List[Dict].
+             **kwargs (Any): Additional keyword arguments for customization.
+
+         Returns:
+             Any: The loaded document, ideally formatted as List[Dict]. Each dictionary within
+                 the list is recommended to follow the structure of model 'Element',
+                 to ensure consistency and ease of use across the Document Processing Orchestrator.
+         """

gllm_docproc/loader/csv/__init__.pyi
@@ -0,0 +1,3 @@
+ from .pandas_loader import PandasLoader as PandasLoader
+
+ __all__ = ['PandasLoader']

gllm_docproc/loader/csv/pandas_loader.pyi
@@ -0,0 +1,53 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, validate_file_extension as validate_file_extension
+ from gllm_docproc.model.element import Element as Element, TABLE as TABLE
+ from gllm_docproc.model.element_metadata import CSV as CSV, ElementMetadata as ElementMetadata
+ from typing import Any
+
+ CSV_VARIANTS: Incomplete
+
+ class PandasLoader(BaseLoader):
+     """A class used to load and process delimited text files using the pandas library.
+
+     This class inherits from the BaseLoader class and overrides its methods to provide
+     functionality for loading delimited text files with different separators:
+     - CSV (Comma-Separated Values): Uses commas (,) as separators between values
+     - TSV (Tab-Separated Values): Uses tabs (\\t) as separators between values
+     - PSV (Pipe-Separated Values): Uses pipe characters (|) as separators between values
+     - SSV (Space-Separated Values): Uses spaces as separators between values
+
+     It provides methods to extract tables from the document and convert them into elements.
+
+     Methods:
+         load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any)
+             -> list[dict[str, Any]]: Load a delimited text file.
+     """
+     logger: Incomplete
+     def __init__(self) -> None:
+         """Initialize the PandasLoader class."""
+     def load(self, source: str, loaded_elements: list[dict[str, Any]] | None = None, **kwargs: Any) -> list[dict[str, Any]]:
+         """Load a delimited text file.
+
+         This method loads a delimited text file (CSV, TSV, PSV, SSV) and extracts the table elements.
+         The method takes the source file path as input and returns a list of Element objects
+         representing the tables in the document.
+
+         Args:
+             source (str): The path to the delimited text file to load.
+             loaded_elements (list[dict[str, Any]]): The loaded elements from previous loaders.
+             **kwargs (Any): Additional keyword arguments to pass to the loader.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+             sep (str, optional): Delimiter to use. If not provided, will be auto-detected.
+             encoding (str, optional): Encoding to use. Defaults to 'utf-8'.
+             header (int | None, optional): Row number (0-based index) to use as column names.
+                 If None, no header row is assumed and numeric column names are generated.
+
+         Returns:
+             list[dict[str, Any]]: The loaded elements from the delimited text file.
+
+         Raises:
+             UnsupportedFileExtensionError: If the file extension is not supported.
+         """

gllm_docproc/loader/docx/__init__.pyi
@@ -0,0 +1,5 @@
+ from .docx2python_loader import DOCX2PythonLoader as DOCX2PythonLoader
+ from .python_docx_loader import PythonDOCXLoader as PythonDOCXLoader
+ from .python_docx_table_loader import PythonDOCXTableLoader as PythonDOCXTableLoader
+
+ __all__ = ['PythonDOCXLoader', 'PythonDOCXTableLoader', 'DOCX2PythonLoader']

gllm_docproc/loader/docx/docx2python_loader.pyi
@@ -0,0 +1,46 @@
+ from _typeshed import Incomplete
+ from gllm_docproc.loader.base_loader import BaseLoader as BaseLoader
+ from gllm_docproc.loader.loader_utils import create_base_element_metadata as create_base_element_metadata, trim_table_empty_cells as trim_table_empty_cells, validate_file_extension as validate_file_extension
+ from gllm_docproc.model.element import Element as Element, FOOTER as FOOTER, HEADER as HEADER, IMAGE as IMAGE, TABLE as TABLE, UNCATEGORIZED_TEXT as UNCATEGORIZED_TEXT
+ from gllm_docproc.model.element_metadata import DOCX as DOCX, ElementMetadata as ElementMetadata
+ from gllm_docproc.model.media import Media as Media, MediaSourceType as MediaSourceType, MediaType as MediaType
+ from typing import Any
+
+ class DOCX2PythonLoader(BaseLoader):
+     """A class for loading and processing DOCX documents using the docx2python library.
+
+     This class defines the structure for loading and processing a DOCX document to retrieve required values
+     (text, table, image, header, footer, footnote, endnote). It implements the 'load' method to handle
+     DOCX loading from a given file path.
+
+     DOCX2PythonLoader is used to extract the text, table, image, header, footer, footnote, and endnote
+     from the DOCX document.
+
+     Methods:
+         load(source, loaded_elements, **kwargs): Load a DOCX document.
+     """
+     duplicate_merged_cells: Incomplete
+     def __init__(self, duplicate_merged_cells: bool = True) -> None:
+         """Initialize the DOCX2PythonLoader.
+
+         Args:
+             duplicate_merged_cells (bool): A boolean value indicating whether to duplicate merged cells.
+         """
+     def load(self, source: str, loaded_elements: list[dict[str, str]] | None = None, **kwargs: Any) -> list[dict[str, str]]:
+         """Load and process a DOCX document specified by the file path and name (source).
+
+         This method defines the process of loading a DOCX document using its file path.
+         It extracts the text, table, image, header, footer, footnote, and endnote from the DOCX document.
+
+         Args:
+             source (str): The file path of the DOCX document.
+             loaded_elements (list[dict[str, str]] | None): A list of loaded elements containing text content
+                 and metadata.
+             **kwargs (Any): Additional keyword arguments for loading the DOCX document.
+
+         Kwargs:
+             original_source (str, optional): The original source of the document.
+
+         Returns:
+             list[dict[str, str]]: A list of dictionaries containing loaded content and metadata.
+         """