gllm-docproc-binary 0.7.26__cp311-cp311-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gllm-docproc-binary might be problematic. Click here for more details.

Files changed (168) hide show
  1. gllm_docproc/__init__.pyi +0 -0
  2. gllm_docproc/chunker/__init__.pyi +3 -0
  3. gllm_docproc/chunker/base_chunker.pyi +28 -0
  4. gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
  5. gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
  6. gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
  7. gllm_docproc/chunker/table/__init__.pyi +3 -0
  8. gllm_docproc/chunker/table/table_chunker.pyi +45 -0
  9. gllm_docproc/converter/__init__.pyi +3 -0
  10. gllm_docproc/converter/base_converter.pyi +15 -0
  11. gllm_docproc/data_generator/__init__.pyi +5 -0
  12. gllm_docproc/data_generator/base_data_generator.pyi +18 -0
  13. gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
  14. gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
  15. gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
  16. gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
  17. gllm_docproc/downloader/__init__.pyi +5 -0
  18. gllm_docproc/downloader/base_downloader.pyi +19 -0
  19. gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
  20. gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
  21. gllm_docproc/downloader/html/__init__.pyi +7 -0
  22. gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
  23. gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
  24. gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
  25. gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
  26. gllm_docproc/downloader/html/html_downloader.pyi +114 -0
  27. gllm_docproc/downloader/html/playwright_downloader.pyi +60 -0
  28. gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
  29. gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
  30. gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
  31. gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
  32. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
  33. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
  34. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
  35. gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
  36. gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
  37. gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
  38. gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
  39. gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
  40. gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
  41. gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
  42. gllm_docproc/dpo_router/__init__.pyi +5 -0
  43. gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
  44. gllm_docproc/dpo_router/loader_router.pyi +52 -0
  45. gllm_docproc/dpo_router/parser_router.pyi +42 -0
  46. gllm_docproc/housekeeping/__init__.pyi +3 -0
  47. gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
  48. gllm_docproc/indexer/__init__.pyi +3 -0
  49. gllm_docproc/indexer/base_indexer.pyi +30 -0
  50. gllm_docproc/indexer/graph/__init__.pyi +4 -0
  51. gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
  52. gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
  53. gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
  54. gllm_docproc/indexer/vector/__init__.pyi +3 -0
  55. gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
  56. gllm_docproc/loader/__init__.pyi +4 -0
  57. gllm_docproc/loader/audio/__init__.pyi +3 -0
  58. gllm_docproc/loader/audio/audio_loader.pyi +45 -0
  59. gllm_docproc/loader/base_loader.pyi +30 -0
  60. gllm_docproc/loader/csv/__init__.pyi +3 -0
  61. gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
  62. gllm_docproc/loader/docx/__init__.pyi +5 -0
  63. gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
  64. gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
  65. gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
  66. gllm_docproc/loader/exception/__init__.pyi +4 -0
  67. gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
  68. gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
  69. gllm_docproc/loader/html/__init__.pyi +5 -0
  70. gllm_docproc/loader/html/exception/__init__.pyi +3 -0
  71. gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
  72. gllm_docproc/loader/html/flat/__init__.pyi +3 -0
  73. gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +66 -0
  74. gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
  75. gllm_docproc/loader/html/flat/html_flat_merger.pyi +23 -0
  76. gllm_docproc/loader/html/html_base_loader.pyi +25 -0
  77. gllm_docproc/loader/html/nested/__init__.pyi +3 -0
  78. gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
  79. gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
  80. gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
  81. gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
  82. gllm_docproc/loader/html/utils/__init__.pyi +0 -0
  83. gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
  84. gllm_docproc/loader/html/utils/html_utils.pyi +59 -0
  85. gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
  86. gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
  87. gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
  88. gllm_docproc/loader/image/__init__.pyi +3 -0
  89. gllm_docproc/loader/image/image_loader.pyi +54 -0
  90. gllm_docproc/loader/json/__init__.pyi +3 -0
  91. gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
  92. gllm_docproc/loader/loader_utils.pyi +43 -0
  93. gllm_docproc/loader/pdf/__init__.pyi +14 -0
  94. gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
  95. gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
  96. gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
  97. gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
  98. gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
  99. gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
  100. gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
  101. gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
  102. gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
  103. gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
  104. gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
  105. gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
  106. gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
  107. gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
  108. gllm_docproc/loader/pipeline_loader.pyi +48 -0
  109. gllm_docproc/loader/pptx/__init__.pyi +3 -0
  110. gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
  111. gllm_docproc/loader/txt/__init__.pyi +3 -0
  112. gllm_docproc/loader/txt/txt_loader.pyi +55 -0
  113. gllm_docproc/loader/video/__init__.pyi +3 -0
  114. gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
  115. gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
  116. gllm_docproc/loader/xlsx/__init__.pyi +3 -0
  117. gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
  118. gllm_docproc/model/__init__.pyi +7 -0
  119. gllm_docproc/model/element.pyi +38 -0
  120. gllm_docproc/model/element_metadata.pyi +35 -0
  121. gllm_docproc/model/loader_type.pyi +20 -0
  122. gllm_docproc/model/media.pyi +51 -0
  123. gllm_docproc/model/parser_type.pyi +19 -0
  124. gllm_docproc/parser/__init__.pyi +4 -0
  125. gllm_docproc/parser/base_parser.pyi +28 -0
  126. gllm_docproc/parser/document/__init__.pyi +7 -0
  127. gllm_docproc/parser/document/docx_parser.pyi +27 -0
  128. gllm_docproc/parser/document/pdf_parser.pyi +35 -0
  129. gllm_docproc/parser/document/pptx_parser.pyi +34 -0
  130. gllm_docproc/parser/document/txt_parser.pyi +22 -0
  131. gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
  132. gllm_docproc/parser/html/__init__.pyi +4 -0
  133. gllm_docproc/parser/html/flat/__init__.pyi +0 -0
  134. gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
  135. gllm_docproc/parser/html/nested/__init__.pyi +0 -0
  136. gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
  137. gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
  138. gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
  139. gllm_docproc/parser/image/__init__.pyi +4 -0
  140. gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
  141. gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
  142. gllm_docproc/parser/pipeline_parser.pyi +33 -0
  143. gllm_docproc/parser/table/__init__.pyi +3 -0
  144. gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
  145. gllm_docproc/request_handler/__init__.pyi +3 -0
  146. gllm_docproc/request_handler/base_request_handler.pyi +16 -0
  147. gllm_docproc/response_handler/__init__.pyi +3 -0
  148. gllm_docproc/response_handler/base_response_handler.pyi +38 -0
  149. gllm_docproc/utils/__init__.pyi +3 -0
  150. gllm_docproc/utils/async_utils.pyi +22 -0
  151. gllm_docproc/utils/file_utils.pyi +76 -0
  152. gllm_docproc/utils/html_constants.pyi +122 -0
  153. gllm_docproc/validator/__init__.pyi +6 -0
  154. gllm_docproc/validator/base_validator.pyi +34 -0
  155. gllm_docproc/validator/character_count_validator.pyi +26 -0
  156. gllm_docproc/validator/file_size_validator.pyi +20 -0
  157. gllm_docproc/validator/model/__init__.pyi +4 -0
  158. gllm_docproc/validator/model/validator_input.pyi +50 -0
  159. gllm_docproc/validator/model/validator_result.pyi +19 -0
  160. gllm_docproc/validator/page_count_validator.pyi +23 -0
  161. gllm_docproc/validator/pipeline_validator.pyi +40 -0
  162. gllm_docproc.build/.gitignore +1 -0
  163. gllm_docproc.cpython-311-darwin.so +0 -0
  164. gllm_docproc.pyi +222 -0
  165. gllm_docproc_binary-0.7.26.dist-info/METADATA +216 -0
  166. gllm_docproc_binary-0.7.26.dist-info/RECORD +168 -0
  167. gllm_docproc_binary-0.7.26.dist-info/WHEEL +5 -0
  168. gllm_docproc_binary-0.7.26.dist-info/top_level.txt +1 -0
@@ -0,0 +1,40 @@
1
+ from gllm_docproc.validator.base_validator import BaseValidator as BaseValidator
2
+ from gllm_docproc.validator.model.validator_input import ValidatorInput as ValidatorInput
3
+ from gllm_docproc.validator.model.validator_result import ValidatorResult as ValidatorResult
4
+
5
+ class PipelineValidator:
6
+ """A pipeline for validating files against multiple validation rules.
7
+
8
+ This class provides a flexible way to validate files by chaining multiple `BaseValidator`
9
+ instances. Each validator is applied sequentially, and validation behavior depends on
10
+ the `stop_on_failure` setting of each validator.
11
+
12
+ Attributes:
13
+ validators (list[BaseValidator]): A list of `BaseValidator` instances to apply for file validation.
14
+ """
15
+ validators: list[BaseValidator]
16
+ def __init__(self) -> None:
17
+ """Initialize the PipelineValidator object."""
18
+ def add_validator(self, validator: BaseValidator) -> PipelineValidator:
19
+ """Add a validator to the validation pipeline.
20
+
21
+ Args:
22
+ validator (BaseValidator): The validator to add to the pipeline.
23
+
24
+ Returns:
25
+ PipelineValidator: The validation pipeline object for method chaining.
26
+ """
27
+ def validate(self, file_validation_input: ValidatorInput) -> list[ValidatorResult]:
28
+ """Validate the file against all configured validation rules.
29
+
30
+ Validation stops early if a validator fails and its `stop_on_failure`
31
+ setting is True; in that case, the returned list will only include results
32
+ up to and including the failing validator.
33
+
34
+ Args:
35
+ file_validation_input (ValidatorInput): The file validation input object to validate.
36
+
37
+ Returns:
38
+ list[ValidatorResult]: A list of ValidatorResult objects for each validator run,
39
+ which may be truncated if a validator with `stop_on_failure=True` fails.
40
+ """
@@ -0,0 +1 @@
1
+ *
Binary file
gllm_docproc.pyi ADDED
@@ -0,0 +1,222 @@
1
+ # This file was generated by Nuitka
2
+
3
+ # Stubs included by default
4
+
5
+
6
+ __name__ = ...
7
+
8
+
9
+
10
+ # Modules used internally, to allow implicit dependencies to be seen:
11
+ import os
12
+ import abc
13
+ import typing
14
+ import hashlib
15
+ import inspect
16
+ import re
17
+ import langchain_text_splitters
18
+ import gllm_docproc.chunker.table.TableChunker
19
+ import pandas
20
+ import asyncio
21
+ import concurrent
22
+ import concurrent.futures
23
+ import concurrent.futures.ThreadPoolExecutor
24
+ import gllm_multimodal
25
+ import gllm_multimodal.constants
26
+ import gllm_multimodal.modality_converter
27
+ import gllm_multimodal.modality_converter.image_to_text
28
+ import gllm_multimodal.modality_converter.image_to_text.image_to_caption
29
+ import gllm_multimodal.modality_converter.image_to_text.image_to_caption.image_to_caption
30
+ import json
31
+ import gllm_core
32
+ import gllm_core.utils
33
+ import gllm_core.utils.logger_manager
34
+ import gllm_core.utils.retry
35
+ import gllm_inference
36
+ import gllm_inference.builder
37
+ import gllm_inference.output_parser
38
+ import gllm_inference.prompt_builder
39
+ import gllm_multimodal.modality_converter.image_to_text.image_to_caption.preset_image_to_caption
40
+ import gllm_privacy
41
+ import gllm_privacy.pii_detector
42
+ import gllm_privacy.pii_detector.text_analyzer
43
+ import gllm_privacy.pii_detector.text_anonymizer
44
+ import langdetect
45
+ import mimetypes
46
+ import time
47
+ import uuid
48
+ import pathlib
49
+ import magic
50
+ import requests
51
+ import requests.adapters
52
+ import urllib3
53
+ import urllib3.util
54
+ import urllib3.util.retry
55
+ import gllm_docproc.downloader.BaseDownloader
56
+ import posixpath
57
+ import bosa_connectors
58
+ import bosa_connectors.connector
59
+ import bosa_connectors.models
60
+ import bosa_connectors.models.file
61
+ import datetime
62
+ import firecrawl
63
+ import pydantic
64
+ import copy
65
+ import html_to_markdown
66
+ import scrapy
67
+ import gllm_docproc.downloader.html.exception.ItemScrapeFailedException
68
+ import gllm_docproc.downloader.html.scraper.scraper.spiders.CrawlBaseSpider
69
+ import gllm_docproc.downloader.html.scraper.scraper.spiders.CrawlSitemapLinkSpider
70
+ import gllm_docproc.downloader.html.scraper.scraper.spiders.CrawlSitemapSpider
71
+ import gllm_docproc.downloader.html.utils.generate_filename_from_url
72
+ import gllm_docproc.downloader.html.utils.is_valid_url
73
+ import urllib
74
+ import urllib.parse
75
+ import playwright
76
+ import playwright.sync_api
77
+ import scrapy.http
78
+ import scrapy_playwright
79
+ import scrapy_playwright.page
80
+ import gllm_docproc.downloader.html.utils.clean_url
81
+ import scrapy.crawler
82
+ import scrapy.spiders
83
+ import scrapy.spiders.sitemap
84
+ import scrapy.utils
85
+ import scrapy.utils.sitemap
86
+ import scrapy.linkextractors
87
+ import billiard
88
+ import gllm_docproc.downloader.html.exception.ZyteApiKeyNotProvidedException
89
+ import gllm_docproc.downloader.html.scraper.scraper.spiders.ScrapeSpider
90
+ import gllm_docproc.downloader.html.scraper.scraper.spiders.ZyteScrapeSpider
91
+ import gllm_multimodal.utils
92
+ import gllm_multimodal.utils.audio_to_text_utils
93
+ import gllm_docproc.loader.image.ImageLoader
94
+ import gllm_docproc.loader.txt.TXTLoader
95
+ import gllm_docproc.model.Element
96
+ import gllm_docproc.model.LoaderType
97
+ import _frozen_importlib_external
98
+ import gllm_docproc.indexer.BaseIndexer
99
+ import gllm_datastore
100
+ import gllm_datastore.graph_data_store
101
+ import gllm_datastore.graph_data_store.light_rag_data_store
102
+ import lightrag
103
+ import lightrag.lightrag
104
+ import gllm_datastore.graph_data_store.llama_index_graph_rag_data_store
105
+ import gllm_datastore.graph_data_store.llama_index_neo4j_graph_rag_data_store
106
+ import llama_index
107
+ import llama_index.core
108
+ import llama_index.core.base
109
+ import llama_index.core.base.embeddings
110
+ import llama_index.core.base.embeddings.base
111
+ import llama_index.core.base.llms
112
+ import llama_index.core.base.llms.base
113
+ import llama_index.core.indices
114
+ import llama_index.core.indices.property_graph
115
+ import llama_index.core.indices.property_graph.transformations
116
+ import llama_index.core.schema
117
+ import llama_index.core.vector_stores
118
+ import llama_index.core.vector_stores.types
119
+ import __future__
120
+ import gllm_core.schema
121
+ import gllm_core.utils.concurrency
122
+ import gllm_datastore.core
123
+ import gllm_datastore.core.capabilities
124
+ import gllm_inference.schema
125
+ import tqdm
126
+ import gllm_multimodal.modality_converter.audio_to_text
127
+ import gllm_multimodal.modality_converter.audio_to_text.audio_to_text
128
+ import gllm_multimodal.modality_converter.schema
129
+ import csv
130
+ import base64
131
+ import docx2python
132
+ import docx2python.docx_output
133
+ import docx
134
+ import docx.table
135
+ import docx.text
136
+ import docx.text.paragraph
137
+ import gllm_docproc.loader.html.flat.HTMLFlatLoader
138
+ import gllm_docproc.loader.html.nested.HTMLNestedLoader
139
+ import parsel
140
+ import cairosvg
141
+ import gllm_docproc.loader.html.exception.HtmlLoadException
142
+ import tabulate
143
+ import itertools
144
+ import re.sub
145
+ import w3lib
146
+ import w3lib.html
147
+ import io
148
+ import PIL
149
+ import gllm_docproc.loader.exception.UnsupportedFileExtensionError
150
+ import zipfile
151
+ import adobe
152
+ import adobe.pdfservices
153
+ import adobe.pdfservices.operation
154
+ import adobe.pdfservices.operation.auth
155
+ import adobe.pdfservices.operation.auth.service_principal_credentials
156
+ import adobe.pdfservices.operation.io
157
+ import adobe.pdfservices.operation.io.cloud_asset
158
+ import adobe.pdfservices.operation.io.stream_asset
159
+ import adobe.pdfservices.operation.pdf_services
160
+ import adobe.pdfservices.operation.pdf_services_media_type
161
+ import adobe.pdfservices.operation.pdf_services_response
162
+ import adobe.pdfservices.operation.pdfjobs
163
+ import adobe.pdfservices.operation.pdfjobs.jobs
164
+ import adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job
165
+ import adobe.pdfservices.operation.pdfjobs.params
166
+ import adobe.pdfservices.operation.pdfjobs.params.extract_pdf
167
+ import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type
168
+ import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params
169
+ import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_renditions_element_type
170
+ import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.table_structure_type
171
+ import adobe.pdfservices.operation.pdfjobs.result
172
+ import adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result
173
+ import fitz
174
+ import azure
175
+ import azure.ai
176
+ import azure.ai.documentintelligence
177
+ import azure.ai.documentintelligence.models
178
+ import azure.core
179
+ import azure.core.credentials
180
+ import collections
181
+ import collections.Counter
182
+ import pdfminer
183
+ import pdfminer.high_level
184
+ import pdfminer.layout
185
+ import gllm_docproc.loader.BaseLoader
186
+ import pdfplumber
187
+ import pdfplumber._typing
188
+ import pdfplumber.page
189
+ import pdfplumber.table
190
+ import math
191
+ import numpy
192
+ import tabula
193
+ import tabula.io
194
+ import gllm_datastore.cache
195
+ import gllm_datastore.cache.hybrid_cache
196
+ import gllm_datastore.cache.hybrid_cache.hybrid_cache
197
+ import gllm_datastore.cache.hybrid_cache.utils
198
+ import pptx
199
+ import pptx.chart
200
+ import pptx.chart.chart
201
+ import pptx.shapes
202
+ import pptx.shapes.base
203
+ import pptx.table
204
+ import sys
205
+ import soundfile
206
+ import scipy
207
+ import gllm_docproc.loader.exception.VideoConversionError
208
+ import gi
209
+ import gi.repository
210
+ import gllm_docproc.utils.run_async_in_sync
211
+ import openpyxl
212
+ import openpyxl.cell
213
+ import openpyxl.cell.cell
214
+ import openpyxl.worksheet
215
+ import openpyxl.worksheet.worksheet
216
+ import enum
217
+ import gllm_docproc.parser.BaseParser
218
+ import subprocess
219
+ import tempfile
220
+ import gllm_multimodal.utils.image_utils
221
+ import codecs
222
+ import types
@@ -0,0 +1,216 @@
1
+ Metadata-Version: 2.2
2
+ Name: gllm-docproc-binary
3
+ Version: 0.7.26
4
+ Summary: A library for orchestrating the processing of document. Typically in a Gen AI applications (but not limited to just Gen AI).
5
+ Author-email: GenAI SDK Team <gat-sdk@gdplabs.id>
6
+ Requires-Python: <3.13,>=3.11
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: bosa-connectors-binary<0.4.0,>=0.3.0
9
+ Requires-Dist: gllm-core-binary<0.4.0,>=0.3.0
10
+ Requires-Dist: gllm-datastore-binary[chroma,elasticsearch]<0.6.0,>=0.5.0
11
+ Requires-Dist: gllm-multimodal-binary[audio]<0.4.0,>=0.3.0
12
+ Requires-Dist: gllm-privacy-binary<0.5.0,>=0.4.0
13
+ Requires-Dist: langchain-text-splitters<0.4.0,>=0.3.2
14
+ Requires-Dist: pandas<3.0.0,>=2.2.3
15
+ Requires-Dist: pydantic<3.0.0,>=2.9.1
16
+ Requires-Dist: tabulate<0.10.0,>=0.9.0
17
+ Requires-Dist: python-magic<0.5.0,>=0.4.27; sys_platform != "win32"
18
+ Requires-Dist: python-magic-bin<0.5.0,>=0.4.14; sys_platform == "win32"
19
+ Provides-Extra: dev
20
+ Requires-Dist: coverage<8.0.0,>=7.4.4; extra == "dev"
21
+ Requires-Dist: mypy<2.0.0,>=1.15.0; extra == "dev"
22
+ Requires-Dist: pre-commit<4.0.0,>=3.7.0; extra == "dev"
23
+ Requires-Dist: pytest<9.0.0,>=8.1.1; extra == "dev"
24
+ Requires-Dist: pytest-asyncio<1.0.0,>=0.23.6; extra == "dev"
25
+ Requires-Dist: pytest-cov<6.0.0,>=5.0.0; extra == "dev"
26
+ Requires-Dist: ruff<1.0.0,>=0.6.7; extra == "dev"
27
+ Provides-Extra: audio
28
+ Requires-Dist: librosa<0.11.0,>=0.10.1; extra == "audio"
29
+ Requires-Dist: tqdm<5.0.0,>=4.66.2; extra == "audio"
30
+ Provides-Extra: docx
31
+ Requires-Dist: docx2python<3.0.0,>=2.8.0; extra == "docx"
32
+ Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == "docx"
33
+ Provides-Extra: html
34
+ Requires-Dist: billiard<5.0.0,>=4.2.1; extra == "html"
35
+ Requires-Dist: firecrawl-py<5.0.0,>=4.3.6; extra == "html"
36
+ Requires-Dist: html-to-markdown<2.0.0,>=1.9.0; extra == "html"
37
+ Requires-Dist: playwright<2.0.0,>=1.40.0; extra == "html"
38
+ Requires-Dist: scrapy<3.0.0,>=2.11.0; extra == "html"
39
+ Requires-Dist: scrapy-playwright<0.1.0,>=0.0.33; extra == "html"
40
+ Requires-Dist: scrapy-zyte-api<1.0.0,>=0.12.2; extra == "html"
41
+ Requires-Dist: zyte-api<1.0.0,>=0.4.8; extra == "html"
42
+ Provides-Extra: html-svg
43
+ Requires-Dist: cairosvg<3.0.0,>=2.8.2; extra == "html-svg"
44
+ Provides-Extra: image
45
+ Requires-Dist: aioresponses<1.0.0,>=0.7.0; extra == "image"
46
+ Requires-Dist: boto3<2.0.0,>=1.38.10; extra == "image"
47
+ Requires-Dist: pillow<12.0.0,>=11.2.1; extra == "image"
48
+ Provides-Extra: kg
49
+ Requires-Dist: asyncpg<1.0.0,>=0.30.0; extra == "kg"
50
+ Requires-Dist: gllm-datastore-binary[kg]<0.6.0,>=0.5.0; extra == "kg"
51
+ Requires-Dist: lightrag-hku<2.0.0,>=1.4.6; extra == "kg"
52
+ Requires-Dist: llama-index-embeddings-openai<1.0.0,>=0.3.0; extra == "kg"
53
+ Requires-Dist: llama-index-llms-openai<1.0.0,>=0.3.0; extra == "kg"
54
+ Provides-Extra: pdf
55
+ Requires-Dist: azure-ai-documentintelligence<2.0.0,>=1.0.0b3; extra == "pdf"
56
+ Requires-Dist: jpype1<2.0.0,>=1.5.0; extra == "pdf"
57
+ Requires-Dist: pdfminer-six<20250000,>=20231228; extra == "pdf"
58
+ Requires-Dist: pdfplumber<1.0.0,>=0.11.4; extra == "pdf"
59
+ Requires-Dist: pdfservices-sdk<5.0.0,>=4.0.0; extra == "pdf"
60
+ Requires-Dist: pymupdf<2.0.0,>=1.24.10; extra == "pdf"
61
+ Requires-Dist: tabula-py<3.0.0,>=2.9.3; extra == "pdf"
62
+ Provides-Extra: pii
63
+ Requires-Dist: langdetect<2.0.0,>=1.0.0; extra == "pii"
64
+ Requires-Dist: torch<3.0.0,>=2.0.0; extra == "pii"
65
+ Provides-Extra: pptx
66
+ Requires-Dist: python-pptx<2.0.0,>=1.0.2; extra == "pptx"
67
+ Provides-Extra: video
68
+ Requires-Dist: PyGObject==3.50.0; sys_platform != "win32" and extra == "video"
69
+ Requires-Dist: numpy<2.0.0,>=1.26.0; extra == "video"
70
+ Requires-Dist: scipy<2.0.0,>=1.15.0; extra == "video"
71
+ Requires-Dist: soundfile<0.14.0,>=0.13.1; extra == "video"
72
+ Provides-Extra: xlsx
73
+ Requires-Dist: openpyxl<4.0.0,>=3.0.10; extra == "xlsx"
74
+
75
+ # GLLM Docproc
76
+
77
+ ## Description
78
+ A library for orchestrating the processing of documents, typically in Gen AI applications (but not limited to Gen AI).
79
+
80
+ ---
81
+
82
+ ## Installation
83
+
84
+ ### Prerequisites
85
+
86
+ Mandatory:
87
+ 1. Python 3.11 or 3.12 (the package declares `Requires-Python: <3.13,>=3.11`) — [Install here](https://www.python.org/downloads/)
88
+ 2. pip — [Install here](https://pip.pypa.io/en/stable/installation/)
89
+ 3. uv — [Install here](https://docs.astral.sh/uv/getting-started/installation/)
90
+ 4. gcloud CLI (for authentication) — [Install here](https://cloud.google.com/sdk/docs/install), then log in using:
91
+ ```bash
92
+ gcloud auth login
93
+ ```
94
+
95
+ ---
96
+
97
+ ### Install from Artifact Registry
98
+
99
+ This requires authentication via the `gcloud` CLI.
100
+
101
+ 1. Export token
102
+ ```
103
+ export GCLOUD_ACCESS_TOKEN="$(gcloud auth print-access-token)"
104
+ ```
105
+
106
+ 2. Configure the index in your `pyproject.toml`
107
+ ```
108
+ [[tool.uv.index]]
109
+ name = "gen-ai-internal"
110
+ url = "https://oauth2accesstoken:${GCLOUD_ACCESS_TOKEN}@glsdk.gdplabs.id/gen-ai-internal/simple/"
111
+ ```
112
+
113
+ 3. Add the dependency
114
+ ```
115
+ uv add gllm-docproc
116
+ ```
117
+
118
+ ---
119
+
120
+ ## Local Development Setup
121
+
122
+ ### Prerequisites
123
+
124
+ 1. Python 3.11 or 3.12 (the package declares `Requires-Python: <3.13,>=3.11`) — [Install here](https://www.python.org/downloads/)
125
+ 2. pip — [Install here](https://pip.pypa.io/en/stable/installation/)
126
+ 3. uv — [Install here](https://docs.astral.sh/uv/getting-started/installation/)
127
+ 4. gcloud CLI — [Install here](https://cloud.google.com/sdk/docs/install), then log in using:
128
+
129
+ ```bash
130
+ gcloud auth login
131
+ ```
132
+ 5. Git — [Install here](https://git-scm.com/downloads)
133
+ 6. Access to the [GDP Labs SDK GitHub repository](https://github.com/GDP-ADMIN/gl-sdk)
134
+
135
+ ---
136
+
137
+ ### 1. Clone Repository
138
+
139
+ ```bash
140
+ git clone git@github.com:GDP-ADMIN/gl-sdk.git
141
+ cd gl-sdk/libs/gllm-docproc
142
+ ```
143
+
144
+ ---
145
+
146
+ ### 2. Setup Authentication
147
+
148
+ Set the following environment variables to authenticate with internal package indexes:
149
+
150
+ ```bash
151
+ export UV_INDEX_GEN_AI_INTERNAL_USERNAME=oauth2accesstoken
152
+ export UV_INDEX_GEN_AI_INTERNAL_PASSWORD="$(gcloud auth print-access-token)"
153
+ export UV_INDEX_GEN_AI_USERNAME=oauth2accesstoken
154
+ export UV_INDEX_GEN_AI_PASSWORD="$(gcloud auth print-access-token)"
155
+ ```
156
+
157
+ ---
158
+
159
+ ### 3. Quick Setup
160
+
161
+ Run:
162
+
163
+ ```bash
164
+ make setup
165
+ ```
166
+
167
+ ---
168
+
169
+ ### 4. Activate Virtual Environment
170
+
171
+ ```bash
172
+ source .venv/bin/activate
173
+ ```
174
+
175
+ ---
176
+
177
+ ## Local Development Utilities
178
+
179
+ The following Makefile commands are available for quick operations:
180
+
181
+ ### Install uv
182
+
183
+ ```bash
184
+ make install-uv
185
+ ```
186
+
187
+ ### Install Pre-Commit
188
+
189
+ ```bash
190
+ make install-pre-commit
191
+ ```
192
+
193
+ ### Install Dependencies
194
+
195
+ ```bash
196
+ make install
197
+ ```
198
+
199
+ ### Update Dependencies
200
+
201
+ ```bash
202
+ make update
203
+ ```
204
+
205
+ ### Run Tests
206
+
207
+ ```bash
208
+ make test
209
+ ```
210
+
211
+ ---
212
+
213
+ ## Contributing
214
+
215
+ Please refer to the [Python Style Guide](https://docs.google.com/document/d/1uRggCrHnVfDPBnG641FyQBwUwLoFw0kTzNqRm92vUwM/edit?usp=sharing)
216
+ for information about code style, documentation standards, and SCA requirements.
@@ -0,0 +1,168 @@
1
+ gllm_docproc.cpython-311-darwin.so,sha256=EP8eyjC9FVhgs2OAC4buLIRBmn1SdwrHLvX4U2zTDsY,6082752
2
+ gllm_docproc.pyi,sha256=8uCMmHYt0XeJoYfZ98TNjUA2J-s727EJk3w0wHjmMG0,7148
3
+ gllm_docproc/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ gllm_docproc/chunker/__init__.pyi,sha256=eA7-yNoiTmdgdk8KUEiFSMNu4nFm7J-D7BV-7J4_QgQ,80
5
+ gllm_docproc/chunker/base_chunker.pyi,sha256=qvUhHYl0Mj-aRDW_uf59gWCWyIyUtnDayZaZTR4-C9I,1132
6
+ gllm_docproc/chunker/structured_element/__init__.pyi,sha256=FGOqqpZRBIMbsHGRiS3DehxJ3dG6wOYTBYThVqytiSI,133
7
+ gllm_docproc/chunker/structured_element/chunk_enricher.pyi,sha256=78w45UE31wcXOR3Xl4dtYwzcgPQS5EHxKuodfEpOAtY,1720
8
+ gllm_docproc/chunker/structured_element/structured_element_chunker.pyi,sha256=Ys7vSVDUOj2cMOYClnUpJbNtzSfzakDrHFfZTSdSeb4,4077
9
+ gllm_docproc/chunker/table/__init__.pyi,sha256=oYBcaQP91xP32ITWWhYezzpYkIytbZ_P9VrCpmbQmjM,159
10
+ gllm_docproc/chunker/table/table_chunker.pyi,sha256=Nb4xDLl5ZR_N8ilRmHZyviNaqHEbeuj0WcNL7owVrP0,1853
11
+ gllm_docproc/converter/__init__.pyi,sha256=t8FWQ6ivPnaIHXKD9tNlaelmtXIguvEPwgNUobwHobE,88
12
+ gllm_docproc/converter/base_converter.pyi,sha256=iyP3RzkxeQvkn0Pmonxk6mljgDoJVYpQ6ketIhZXDyA,427
13
+ gllm_docproc/data_generator/__init__.pyi,sha256=uk6fwUT9e0l7fBr-9YAckgDIByosjrHU_kUJmQgSLHE,522
14
+ gllm_docproc/data_generator/base_data_generator.pyi,sha256=Asap_mx_bWBGPWaAEzLZtRDpFOe2JxcYm2APgDjgB9I,772
15
+ gllm_docproc/data_generator/image_data_generator/__init__.pyi,sha256=w19oyxFACC7FBV8nAz7zRmqlJuT7JuI3oRjVodxIvoM,402
16
+ gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi,sha256=EB_3-dtgwxa6kHIdO3erblT27UTbI2w_gyJE1lyY4a4,2120
17
+ gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi,sha256=10Wf94Gjq54me8VF8v4xEIW1qXSCKr5xH4BXYFT3pnU,2720
18
+ gllm_docproc/data_generator/pii_data_generator/__init__.pyi,sha256=VaDnIoqdremFO6xCyLLwEzT9Aq6_4jT-Hbi_8OXFpTI,122
19
+ gllm_docproc/downloader/__init__.pyi,sha256=qWnleQX8bx1jJRs9wZFWc4cd419ePU9LmCZdfYw1q0k,319
20
+ gllm_docproc/downloader/base_downloader.pyi,sha256=BBpGEVCPAkYHXgpPWRBYCY5iQO-KVpxnuMhJDMJ4ao0,813
21
+ gllm_docproc/downloader/direct_file_url_downloader.pyi,sha256=i6B6n47YXu933r889PIvmIGI280E5wTYO_0NBbcGDqk,1822
22
+ gllm_docproc/downloader/google_drive_downloader.pyi,sha256=6A_RUCoioTsk81p8Vfuz4DMP2-Q9YYDoanKRkUyUSaE,1553
23
+ gllm_docproc/downloader/html/__init__.pyi,sha256=DPySyZD5Bdnf_RQOI7srAqtV1HPRLqDdERUlF4lGtKs,442
24
+ gllm_docproc/downloader/html/firecrawl_downloader.pyi,sha256=CYs34A-DaTtUy_2WtyWaFYFH0zO70YFXbhD-bS0uLGg,2423
25
+ gllm_docproc/downloader/html/html_downloader.pyi,sha256=cm8tBuI-F9Uo96bWkm3_-T98CpUXCzCn0yp-uRAxCc4,5851
26
+ gllm_docproc/downloader/html/playwright_downloader.pyi,sha256=W-vVSwKfdNeYhmXC05ne7RldPhdp2zK3bLMmgCIc0yc,3252
27
+ gllm_docproc/downloader/html/requests_downloader.pyi,sha256=CKXch7MYSw1tDEFjpEy_-NCWOVpKTbFmmkYIuxMkFvA,2355
28
+ gllm_docproc/downloader/html/exception/__init__.pyi,sha256=ZEV6EjWuDZ6Rr-mRm3gad8DuvmNivr2v8fYVXxFB-ks,286
29
+ gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi,sha256=MrQnR6wuxec85q-54QZ27kCnw4p4CUj8hawfAMldVCE,607
30
+ gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi,sha256=gjX1ovxTSMQ3dCgYA-434Hp-rURVB5xEtlfMVvpsAmU,589
31
+ gllm_docproc/downloader/html/scraper/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
+ gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi,sha256=6vsdbmBsIN-MfxuAII1M2dYBvBI3uKtkEd7M6JRGx2M,2112
33
+ gllm_docproc/downloader/html/scraper/scraper/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi,sha256=oPIUQoIMRMTuyUOMvMZgzRjyVybMYNqlrDmD5ewMWVI,658
35
+ gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi,sha256=suD5on_SoYZkeaIKJGM_5fg_lKwY2Z1sIbuMwsMAHjo,1339
36
+ gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi,sha256=THBhGOCYmdZJsN9yHQOdMUJh85HPtmb9K1x5d2LAOsQ,1120
37
+ gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi,sha256=C3-DzmzYXDv8j4ROoP2AEgOQ6phTI9dAmx5abPXXFNA,2544
38
+ gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi,sha256=SJgRvKNw-wVsc7oCG2z4OUOygIVRcbYosv4IG2ZIQH0,2567
39
+ gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi,sha256=zm1KgUJtKNO5gFJPzZ9liAVO9sDSQbWjY0eiqUnSXE8,947
40
+ gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi,sha256=MKOTG0t-QPH7Gnw7obpryskQIBRbhXPidmwEyDcvjrU,2082
41
+ gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi,sha256=SqzUlfsv84MPcCEgvmAoRdgOQvTgbF-Su0Pl2NxKgZQ,2138
42
+ gllm_docproc/downloader/html/utils/__init__.pyi,sha256=YpN23jkvKEM81BO1AYj_TOyl5n3kmx3AklFFlzwhpcg,205
43
+ gllm_docproc/downloader/html/utils/web_utils.pyi,sha256=iGuo1nclEi4vK9JkgqBtPmiPPpiCBxvnxntLTckzd0Y,1395
44
+ gllm_docproc/dpo_router/__init__.pyi,sha256=nG-PYRMjqEvheg7rIXE6VFzjsQDLNQIfZae9_38eIOs,233
45
+ gllm_docproc/dpo_router/base_dpo_router.pyi,sha256=gVC3VUmVXfe3byyfugafEMQLQ-rV2S24-UN0t1_NvNs,579
46
+ gllm_docproc/dpo_router/loader_router.pyi,sha256=mKvvG0qqTGSjOPNRs_ao1GO6RucWDlkLoVoNRztIB74,2359
47
+ gllm_docproc/dpo_router/parser_router.pyi,sha256=6VMTLAKDWIr6z1xZoZMpUU0yERnixPpT6gXgnnVaCO8,1933
48
+ gllm_docproc/housekeeping/__init__.pyi,sha256=hCHqIldkCC8OetWprbipx1XwenES9iGHlaNRTwzqR4g,100
49
+ gllm_docproc/housekeeping/base_housekeeping.pyi,sha256=I8FXfPtsDhBR6z2mxsjAvij2T4RewLl6HmCGZ03skjw,416
50
+ gllm_docproc/indexer/__init__.pyi,sha256=2rF0XCyWpcpYif-We8ZC-ZpCAzLpMoClEmXhNVh2-s8,100
51
+ gllm_docproc/indexer/base_indexer.pyi,sha256=Tuj7EAMNrTkaU43Pn2GBZ-b_d4ztQLVqDHhFyxClj-s,1052
52
+ gllm_docproc/indexer/graph/__init__.pyi,sha256=iy99lSSfOgkuK2mbSup10kqqpgIfd5esAM57BSFpJ0w,310
53
+ gllm_docproc/indexer/graph/graph_rag_indexer.pyi,sha256=slEIIYp_JTc_SIKkAQlNnnDZiS34mL85jlbCcT3zF-U,368
54
+ gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi,sha256=OJq1Qj6hABHiqU-ZdkuzSt1DK_BNv41dakIPruQ9RE4,4191
55
+ gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi,sha256=6APvTbXCTWvgpnxkwH86R153AsVmk1kub1lpYX9Uejs,4818
56
+ gllm_docproc/indexer/vector/__init__.pyi,sha256=AgWiL44h98hoAljuY48pnoiSs-gO4qx5zja_FLSjw68,97
57
+ gllm_docproc/indexer/vector/vector_db_indexer.pyi,sha256=u9Bu_DGbZEFGDR7DtFLNFzR0RB_b6A5tb6TnKqgLoQk,2360
58
+ gllm_docproc/loader/__init__.pyi,sha256=STxqJvk7NyqQtJ1pmBycKaKgt2aNYaFfpXoTqMCzY9g,156
59
+ gllm_docproc/loader/base_loader.pyi,sha256=qf6exWFn7ShRLdHl0LoyOIRyGV74BNA57RFavtX1GQw,1322
60
+ gllm_docproc/loader/loader_utils.pyi,sha256=GX7HbduYkCeoBYOHBctlIurNWgclbePePEv9BYQ_zFI,1828
61
+ gllm_docproc/loader/pipeline_loader.pyi,sha256=w8XMX-3DOJ6RN5y8pU-vW1N4HzlopwuJLQgGC4GdZ3A,1771
62
+ gllm_docproc/loader/audio/__init__.pyi,sha256=dNPtbcP9rdG7Dr_jYcUEZ_rpyK2X0LdFC9S_JG8PcpQ,80
63
+ gllm_docproc/loader/audio/audio_loader.pyi,sha256=1s7cGsQdz3KSxMc2AAHYqKHxkFlnqkFTmPAYQgPDkMM,2243
64
+ gllm_docproc/loader/csv/__init__.pyi,sha256=tYZk9vD4H1dLn6AkYaOZK5YQs4nhLLUtdWdHnm9rQ0g,84
65
+ gllm_docproc/loader/csv/pandas_loader.pyi,sha256=VYWUWQhN0dF1K82r6h4Tv4liW3lpj77xBuSMAN6R2fQ,2808
66
+ gllm_docproc/loader/docx/__init__.pyi,sha256=R69OD2iJ7-M9mn6Nm3_iqIY1BxHYodHemyYuWE2LMyg,303
67
+ gllm_docproc/loader/docx/docx2python_loader.pyi,sha256=-9ZIM9KKGwHClCjaPbaTrCIcC4_im6W1wMrHeHvcJS0,2523
68
+ gllm_docproc/loader/docx/python_docx_loader.pyi,sha256=SBp4IqD19zG-MJ_doOrW7JIbbtlPBvn3--uCWEv7loI,2016
69
+ gllm_docproc/loader/docx/python_docx_table_loader.pyi,sha256=RokUQ6a6k-pDbGTqUJyoVVG89Wo5G_dfSUjshFYHl2A,1721
70
+ gllm_docproc/loader/exception/__init__.pyi,sha256=9mD9OB7BGgG_tEVAMw0ahzIEu6dnVqyiEqcxr5PBoCA,259
71
+ gllm_docproc/loader/exception/unsupported_file_extension_error.pyi,sha256=ux-G38V35LP3Y__I98ccTv5ekoOdOXUKK2UQh0WYd3g,261
72
+ gllm_docproc/loader/exception/video_conversion_error.pyi,sha256=wUhhMh6SCMFcT0G9pvHdlCRJ7jYvznscR8AIuATdfL4,430
73
+ gllm_docproc/loader/html/__init__.pyi,sha256=rBA2i4IXnLnpLPi0SHvGtUMVgJ1pFMV-48pIrS-1cKQ,239
74
+ gllm_docproc/loader/html/html_base_loader.pyi,sha256=_cQCV_DGY5XkfU0VlVirh2im9bFO0WK_uE4RZCKe7z8,1160
75
+ gllm_docproc/loader/html/exception/__init__.pyi,sha256=itSjlJJFFC9Z9ZYdby5iW1yd77eKws0tEwOkQmnVHm0,105
76
+ gllm_docproc/loader/html/exception/html_load_exception.pyi,sha256=7umMZsJSHs3H-gyIsxRtaGSmvpVskU25MEGS6DLy9-w,254
77
+ gllm_docproc/loader/html/flat/__init__.pyi,sha256=Q7WDbp0ZZ1ATcvn8I4Ix1OBJYGXajHMXxXceAmig8c0,93
78
+ gllm_docproc/loader/html/flat/html_flat_base_handler.pyi,sha256=TuLF8ZYC3PqAznNJMMOlO5cLxvOxXT-ZTnRd9ToeWdg,3284
79
+ gllm_docproc/loader/html/flat/html_flat_loader.pyi,sha256=SBlBs2lrC0Xs4YfaWPQeCgCVtNwcl6yLXRWRe1mtspg,1781
80
+ gllm_docproc/loader/html/flat/html_flat_merger.pyi,sha256=H73POQph7FT-xdATJhDnS4zMdc0rg4a5Jui8rMF0ykc,1455
81
+ gllm_docproc/loader/html/nested/__init__.pyi,sha256=pd14asYg0pcfwnlLkvE4rpfIKFphdg86Kfy_Jun74ho,101
82
+ gllm_docproc/loader/html/nested/dictionary_utils.pyi,sha256=TNcvYtYUv41pZeMaQVX3WW9gIe0elI9l_MmzIWzcj64,1494
83
+ gllm_docproc/loader/html/nested/html_nested_base_handler.pyi,sha256=YBlhfDq_Mq0eYiA-8XDIH1EnlqxnSm6Tnzb6Af3dMwY,4734
84
+ gllm_docproc/loader/html/nested/html_nested_element_handler.pyi,sha256=wHJJDl28NgfbsKCD8UgsucWth4mXVmMMdp0c2RaeEtc,916
85
+ gllm_docproc/loader/html/nested/html_nested_loader.pyi,sha256=YXXo34KJ9WMzYdzSng0a6fq9THf909-CkoTJ81pdLKs,917
86
+ gllm_docproc/loader/html/utils/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
+ gllm_docproc/loader/html/utils/flat_table_utils.pyi,sha256=NLYp_nth5uYIq3EV6T1BqDD-SB6wsBaMiafWa5OZo5I,1949
88
+ gllm_docproc/loader/html/utils/html_utils.pyi,sha256=D4UaWblGJOSOnjl_HNuxXp7fE6wdVKJ2NU1jbgWdhp8,2335
89
+ gllm_docproc/loader/html/utils/removed_components.pyi,sha256=UGUFq-rUlufUbs7uAp_FbDNzNVe_f4YVviwx_TLVSOg,2090
90
+ gllm_docproc/loader/html/utils/string_utils.pyi,sha256=OrXkM4n53mS5GIrlZJNCfQQLP8CMULt4yVyogNW38r4,1092
91
+ gllm_docproc/loader/html/utils/table_utils.pyi,sha256=vbfesENns4ozUOhEH9pDVRgCTS3wKV5dIhYIuJu1V5w,2740
92
+ gllm_docproc/loader/image/__init__.pyi,sha256=Eg6DMkEWfCI1Djmjw3-ZLJ4ydY19Ax-sjzpcz9FNkWs,80
93
+ gllm_docproc/loader/image/image_loader.pyi,sha256=FygZyED2NySP1TG2RfhMJwMF8IaiJGWIGVbCiKwudxM,2463
94
+ gllm_docproc/loader/json/__init__.pyi,sha256=FwvHdSzGzNaPGTw0ge_-YVqBoPj5YQtLJLs3JxdNvEE,109
95
+ gllm_docproc/loader/json/json_elements_loader.pyi,sha256=9nXNC7X3grByacMqrm9jUf9tFbGSd7YA2pUvQxsng6s,1466
96
+ gllm_docproc/loader/pdf/__init__.pyi,sha256=sVzBJIHISSMMf59RhztY9o0fP6f6sc7Dg2_ftJxqQEo,1276
97
+ gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi,sha256=KOxp2fKL9ijX83MGjFZgYlEVKiPKDcUveOvzOiNtsqo,1691
98
+ gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi,sha256=mWvju7jHJNC6cpzxT0SfAoAWuZpTKufzCAP_zKgpSNo,2614
99
+ gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi,sha256=-az2yXu3NqE5_LuB2QW_z9x3Nzfjxjfrkyj1RBXP25w,2628
100
+ gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi,sha256=ghQ71Fvjgh707YONVFfnV4hlQAECUfYO2LGmXFvWdfQ,1660
101
+ gllm_docproc/loader/pdf/pdf_loader_utils.pyi,sha256=u7FGjtY0nS7AEGFG2rP_xB8d9vf-7j023MoQ_s3AhfY,2938
102
+ gllm_docproc/loader/pdf/pdf_miner_loader.pyi,sha256=J3ktjrct3_OzXoD8EU5R8N7UgmHBWJwUUxKExTUJxbo,2216
103
+ gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi,sha256=l1JY1M8JvAVlAK73VDhJCLPWp1lAf9QmnUmSPCp6udI,1518
104
+ gllm_docproc/loader/pdf/pdf_page_loader.pyi,sha256=UWMIDigAamNlfTUcuzKvUsQjU4Qbf7dEBsPLH1mRKV0,2030
105
+ gllm_docproc/loader/pdf/pdf_plumber_loader.pyi,sha256=Sltw_bo0-tnDTSILuOAenEgUIDBSn4TQFB32G97G9nA,2017
106
+ gllm_docproc/loader/pdf/pymupdf_loader.pyi,sha256=m9eve25n7K2VKqTAzivFXTwEwtBlkXxN2ANbY8Hgkg4,3443
107
+ gllm_docproc/loader/pdf/pymupdf_span_loader.pyi,sha256=Rmrrc-VIQuceejRJ0WbWDB2YMwcPLEgsKXG-gDYuUQk,3462
108
+ gllm_docproc/loader/pdf/pymupdf_utils.pyi,sha256=v-K2r0CuQ8aJbKRRmXqtutedQ3-cwlAg2sSMquk19r0,3601
109
+ gllm_docproc/loader/pdf/tabula_loader.pyi,sha256=8y6gHpttx44u4ryovzsW4O7QfvBAMxHbnmaHOBz23I8,1756
110
+ gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi,sha256=tjuyRXvE50V4sBpTCGLDPJv0Vo0_noZqoyhoP8CUUdM,2008
111
+ gllm_docproc/loader/pptx/__init__.pyi,sha256=ZTVZxVIkdkysesUuZ06EaodddLVJwcaaPvIZdQg2t8E,101
112
+ gllm_docproc/loader/pptx/python_pptx_loader.pyi,sha256=rxjc0e5ksP_ZY7EJQSx_YJLsoQ-W5Cd_OyfBIfL_1go,2532
113
+ gllm_docproc/loader/txt/__init__.pyi,sha256=0Lz4_zBg0emZiL62MCZRBHTB_ebiGWDE7QlXf3SwwrU,72
114
+ gllm_docproc/loader/txt/txt_loader.pyi,sha256=Zyb7a_7MK00BJA2RcycXQdtl7YeBsBlnOHv3g63ktTA,2219
115
+ gllm_docproc/loader/video/__init__.pyi,sha256=THvj_OJ341LQ_XeUcf_mU-Gr8cpTbrIcr7LVESLDGMk,121
116
+ gllm_docproc/loader/video/video_loader_utils.pyi,sha256=IMKY4p3WdVZni4cC6WAvVQQLKHmeDhYeOYPMWkVQrok,4717
117
+ gllm_docproc/loader/video/video_transcript_loader.pyi,sha256=bhd4xtTRj94_--cXL-_oXK3pq85ML0rR0L-EhMz-A-0,3150
118
+ gllm_docproc/loader/xlsx/__init__.pyi,sha256=HbXl6OsK-Ws7NGrz5VOVbBEm-6TqzEF960ekCanbh_k,92
119
+ gllm_docproc/loader/xlsx/openpyxl_loader.pyi,sha256=0pc4Fh2CY1uox515l9dMKPPqboVa_HdhSoTSa_zo8H8,1926
120
+ gllm_docproc/model/__init__.pyi,sha256=HxFPOznQRhtZrJDS41e5nZhqRJFYy54sgYsbCIalwPk,318
121
+ gllm_docproc/model/element.pyi,sha256=V2jE90UFpFje9SHAYN10bEMAUV9IPFxBS6aA66C_jcA,1058
122
+ gllm_docproc/model/element_metadata.pyi,sha256=YqbUryhgAkqMTdncZEIq3VQq_uGmTgTI__A__O3iD4I,811
123
+ gllm_docproc/model/loader_type.pyi,sha256=KElt3ZoKK-8KspLNAgM7d_rEzSRL18orzUgVlpoMVBM,413
124
+ gllm_docproc/model/media.pyi,sha256=8LkDAtgan-ComRLnp6_Kblghovl18hwR6S71K6jnXpM,1660
125
+ gllm_docproc/model/parser_type.pyi,sha256=AVv-VVsLXdEYXWEdXdZAAoIkOn1ocUeNoNfsOGz9mME,383
126
+ gllm_docproc/parser/__init__.pyi,sha256=jY_LxmoS3ye6-3pZM77x0ml-s3d3cARGBATPHDxoXqU,156
127
+ gllm_docproc/parser/base_parser.pyi,sha256=F8vVW8rHMt-6XknlgQy3qEv32rHVg3gUhWABzNsGWhs,1135
128
+ gllm_docproc/parser/pipeline_parser.pyi,sha256=emQW79pLtb3kU1FUig2SWGaA-e8Syw8IXFH1yOUmcPI,1237
129
+ gllm_docproc/parser/document/__init__.pyi,sha256=I-qd3vokhbX6siPe8kvwjs1kWUsFz5rzYS5nkwu_H3g,324
130
+ gllm_docproc/parser/document/docx_parser.pyi,sha256=YAzEoZWVcL0_MGHj6Tlx0OFII3F4XWD3xomMvvMO2kI,1519
131
+ gllm_docproc/parser/document/pdf_parser.pyi,sha256=4OwnSCrl6-pQehpEjQ-vm8Ho66I8JR1BlvidVvzxYig,1511
132
+ gllm_docproc/parser/document/pptx_parser.pyi,sha256=bHGUqVm341W7pggYdZUgU_fWs25WJ_hVHPDs2MHl0k8,1590
133
+ gllm_docproc/parser/document/txt_parser.pyi,sha256=vfLI61LCBTRALu2fG-TEnYEksuFxaiz3A0F3kqbhPN8,863
134
+ gllm_docproc/parser/document/xlsx_parser.pyi,sha256=DJtuJxYJDsWp1DE3k9kSYqObVyYoAb7KOjWow_yqvcE,1068
135
+ gllm_docproc/parser/html/__init__.pyi,sha256=Tq5phnPf8IxLEjAvAS0O1lc6hoCFK8yYtLiHc1q-plM,194
136
+ gllm_docproc/parser/html/flat/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
+ gllm_docproc/parser/html/flat/html_flat_parser.pyi,sha256=1Q5tEzaZ7Fro1V3L3ky_IRubkLDMeK6Rl6J8aDGgCfE,1253
138
+ gllm_docproc/parser/html/nested/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
+ gllm_docproc/parser/html/nested/html_json_processor.pyi,sha256=MyBi2fAd-qVQrVYeuOi0Mk17MEyiGfltbobzd8506Yw,6319
140
+ gllm_docproc/parser/html/nested/html_nested_parser.pyi,sha256=ZK2dkti7g664LdnlQcIAmtLVCE1rZX66A1VA124PH7A,990
141
+ gllm_docproc/parser/html/nested/nested_element.pyi,sha256=Ym4OEkrT58XFpaMA_gWUeopbxrDPoWanFB0FE5UZLHc,1246
142
+ gllm_docproc/parser/image/__init__.pyi,sha256=5qO_8zVevpSuph9BtfUi_CBk2xlXFcxYh786KWHz7p8,335
143
+ gllm_docproc/parser/image/image_mime_normalization_parser.pyi,sha256=4jOHOVQn5F8jNX2h6lgoZ1t5rnE9dqPmQ2uLnUvpp5U,2044
144
+ gllm_docproc/parser/image/image_plain_small_filter_parser.pyi,sha256=k7FstB7VBgJ-80AUnAMGg4QBe1rZsp2l91zWOhoQRIU,1891
145
+ gllm_docproc/parser/table/__init__.pyi,sha256=MPFJ33qPHic7DgS8QwOI2-0gw9SwhsFjmHDkvGsYHGc,109
146
+ gllm_docproc/parser/table/table_caption_parser.pyi,sha256=VU1y8vg4JfXb-NWvnZP4QiFxDD9mcss_xVFxVyMm10E,2704
147
+ gllm_docproc/request_handler/__init__.pyi,sha256=hJJrWfbAdRUH8jZfjW4Q2PqNhjo6rvK8BTDVCbBoJ28,109
148
+ gllm_docproc/request_handler/base_request_handler.pyi,sha256=kxNm-Qzg5OFL-kqP3PNtQGwn1d3Na2eXNqk6D28fiJ8,445
149
+ gllm_docproc/response_handler/__init__.pyi,sha256=Ch5ht8cCWZqvePwp7Az9Uq-18ZPpghBwB7P8UnmUINw,113
150
+ gllm_docproc/response_handler/base_response_handler.pyi,sha256=vaozBa_1MEoW4EfSC_vLvvI5NrpbxAo2061uHqfrlGM,1249
151
+ gllm_docproc/utils/__init__.pyi,sha256=4JRXAdy2jD9KY041P20tUSOC7ldcL5ouxmNw0wH3Q-8,97
152
+ gllm_docproc/utils/async_utils.pyi,sha256=fvIjvXEQtHNTtEv0-4OuGfClP3TtJ_PBdXxwCpwCDfU,637
153
+ gllm_docproc/utils/file_utils.pyi,sha256=OEw7le7YUxc1HgUuYTzGW9Ml-XBkp56OsWsJW38_95I,2608
154
+ gllm_docproc/utils/html_constants.pyi,sha256=yjsR3x6UlOG8FV09_Ih_HlxZGGZ80GIraBW-mUU05vk,2826
155
+ gllm_docproc/validator/__init__.pyi,sha256=GqonAdrVx64F886RNEx8jC9umhPZgMEeSa952gcLGAA,499
156
+ gllm_docproc/validator/base_validator.pyi,sha256=JzyZQpMhM8khGgQ_j1AP04nJdYpLmLRw_s0AGqTYepY,1585
157
+ gllm_docproc/validator/character_count_validator.pyi,sha256=ovIc2_zCuSYworo-GYz6Ota9ugievC1RWzd-ZPh6Po8,1481
158
+ gllm_docproc/validator/file_size_validator.pyi,sha256=REDnAJzJxLETWYHQtMVPMRiJ_w31Mj2WsCX9Ot2190I,1196
159
+ gllm_docproc/validator/page_count_validator.pyi,sha256=usZslugBmm5rHGElvNzjD9jhJvQ-8MXCZyIghrGYZ5g,1284
160
+ gllm_docproc/validator/pipeline_validator.pyi,sha256=ugtKNk1WgYZjS7GIFjyQzz_AbwdTfNJaWgbzCuM7dP4,1890
161
+ gllm_docproc/validator/model/__init__.pyi,sha256=VYI9jyIs0eIjZYofewTUMzAPziAe2XH5f23QAhK3NoE,176
162
+ gllm_docproc/validator/model/validator_input.pyi,sha256=uefNnvzQdIJZ6cuehl6zwF21zYA5Hd6W27lLp6WgKB8,1833
163
+ gllm_docproc/validator/model/validator_result.pyi,sha256=AkTTC67AWfl6U-z75Q8mOb56ozbguI5uJLn4B6jSAWM,715
164
+ gllm_docproc.build/.gitignore,sha256=aEiIwOuxfzdCmLZe4oB1JsBmCUxwG8x-u-HBCV9JT8E,1
165
+ gllm_docproc_binary-0.7.26.dist-info/METADATA,sha256=-ugtJdJKM8ZOfVKJFaOV3T5W-A2k8ayUfdckmE1okkQ,6488
166
+ gllm_docproc_binary-0.7.26.dist-info/WHEEL,sha256=hF0GNNNOCwRi0V1KNSXUmTJBNSRZ_92NEYrTCPhm6WA,104
167
+ gllm_docproc_binary-0.7.26.dist-info/top_level.txt,sha256=FzUqfBCCn6DsB0K9QO5mNXrR2VbqKu__KhFzHgHVz90,13
168
+ gllm_docproc_binary-0.7.26.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: Nuitka (2.6.9)
3
+ Root-Is-Purelib: false
4
+ Tag: cp311-cp311-macosx_13_0_arm64
5
+
@@ -0,0 +1 @@
1
+ gllm_docproc