gllm-docproc-binary 0.7.22__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gllm-docproc-binary might be problematic. Click here for more details.
- gllm_docproc/__init__.pyi +0 -0
- gllm_docproc/chunker/__init__.pyi +3 -0
- gllm_docproc/chunker/base_chunker.pyi +28 -0
- gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
- gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
- gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
- gllm_docproc/chunker/table/__init__.pyi +3 -0
- gllm_docproc/chunker/table/table_chunker.pyi +45 -0
- gllm_docproc/converter/__init__.pyi +3 -0
- gllm_docproc/converter/base_converter.pyi +15 -0
- gllm_docproc/data_generator/__init__.pyi +5 -0
- gllm_docproc/data_generator/base_data_generator.pyi +18 -0
- gllm_docproc/data_generator/image_data_generator/__init__.pyi +4 -0
- gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi +40 -0
- gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi +51 -0
- gllm_docproc/data_generator/pii_data_generator/__init__.pyi +1 -0
- gllm_docproc/downloader/__init__.pyi +5 -0
- gllm_docproc/downloader/base_downloader.pyi +19 -0
- gllm_docproc/downloader/direct_file_url_downloader.pyi +40 -0
- gllm_docproc/downloader/google_drive_downloader.pyi +36 -0
- gllm_docproc/downloader/html/__init__.pyi +6 -0
- gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
- gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
- gllm_docproc/downloader/html/firecrawl_downloader.pyi +49 -0
- gllm_docproc/downloader/html/html_downloader.pyi +114 -0
- gllm_docproc/downloader/html/requests_downloader.pyi +46 -0
- gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +28 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +66 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
- gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +43 -0
- gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
- gllm_docproc/downloader/html/utils/web_utils.pyi +39 -0
- gllm_docproc/dpo_router/__init__.pyi +5 -0
- gllm_docproc/dpo_router/base_dpo_router.pyi +16 -0
- gllm_docproc/dpo_router/loader_router.pyi +52 -0
- gllm_docproc/dpo_router/parser_router.pyi +42 -0
- gllm_docproc/housekeeping/__init__.pyi +3 -0
- gllm_docproc/housekeeping/base_housekeeping.pyi +14 -0
- gllm_docproc/indexer/__init__.pyi +3 -0
- gllm_docproc/indexer/base_indexer.pyi +30 -0
- gllm_docproc/indexer/graph/__init__.pyi +4 -0
- gllm_docproc/indexer/graph/graph_rag_indexer.pyi +11 -0
- gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi +97 -0
- gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi +79 -0
- gllm_docproc/indexer/vector/__init__.pyi +3 -0
- gllm_docproc/indexer/vector/vector_db_indexer.pyi +53 -0
- gllm_docproc/loader/__init__.pyi +4 -0
- gllm_docproc/loader/audio/__init__.pyi +3 -0
- gllm_docproc/loader/audio/audio_loader.pyi +45 -0
- gllm_docproc/loader/base_loader.pyi +30 -0
- gllm_docproc/loader/csv/__init__.pyi +3 -0
- gllm_docproc/loader/csv/pandas_loader.pyi +53 -0
- gllm_docproc/loader/docx/__init__.pyi +5 -0
- gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
- gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
- gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
- gllm_docproc/loader/exception/__init__.pyi +4 -0
- gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
- gllm_docproc/loader/exception/video_conversion_error.pyi +12 -0
- gllm_docproc/loader/html/__init__.pyi +5 -0
- gllm_docproc/loader/html/exception/__init__.pyi +3 -0
- gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
- gllm_docproc/loader/html/flat/__init__.pyi +3 -0
- gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +65 -0
- gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
- gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
- gllm_docproc/loader/html/html_base_loader.pyi +25 -0
- gllm_docproc/loader/html/nested/__init__.pyi +3 -0
- gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
- gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
- gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
- gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
- gllm_docproc/loader/html/utils/__init__.pyi +0 -0
- gllm_docproc/loader/html/utils/flat_table_utils.pyi +44 -0
- gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
- gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
- gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
- gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
- gllm_docproc/loader/image/__init__.pyi +3 -0
- gllm_docproc/loader/image/image_loader.pyi +54 -0
- gllm_docproc/loader/json/__init__.pyi +3 -0
- gllm_docproc/loader/json/json_elements_loader.pyi +35 -0
- gllm_docproc/loader/loader_utils.pyi +43 -0
- gllm_docproc/loader/pdf/__init__.pyi +14 -0
- gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +37 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +47 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +49 -0
- gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
- gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
- gllm_docproc/loader/pdf/pdf_page_loader.pyi +41 -0
- gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +35 -0
- gllm_docproc/loader/pdf/pymupdf_loader.pyi +55 -0
- gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +56 -0
- gllm_docproc/loader/pdf/pymupdf_utils.pyi +77 -0
- gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
- gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
- gllm_docproc/loader/pipeline_loader.pyi +48 -0
- gllm_docproc/loader/pptx/__init__.pyi +3 -0
- gllm_docproc/loader/pptx/python_pptx_loader.pyi +48 -0
- gllm_docproc/loader/txt/__init__.pyi +3 -0
- gllm_docproc/loader/txt/txt_loader.pyi +55 -0
- gllm_docproc/loader/video/__init__.pyi +3 -0
- gllm_docproc/loader/video/video_loader_utils.pyi +97 -0
- gllm_docproc/loader/video/video_transcript_loader.pyi +59 -0
- gllm_docproc/loader/xlsx/__init__.pyi +3 -0
- gllm_docproc/loader/xlsx/openpyxl_loader.pyi +36 -0
- gllm_docproc/model/__init__.pyi +7 -0
- gllm_docproc/model/element.pyi +38 -0
- gllm_docproc/model/element_metadata.pyi +35 -0
- gllm_docproc/model/loader_type.pyi +20 -0
- gllm_docproc/model/media.pyi +51 -0
- gllm_docproc/model/parser_type.pyi +19 -0
- gllm_docproc/parser/__init__.pyi +4 -0
- gllm_docproc/parser/base_parser.pyi +28 -0
- gllm_docproc/parser/document/__init__.pyi +7 -0
- gllm_docproc/parser/document/docx_parser.pyi +27 -0
- gllm_docproc/parser/document/pdf_parser.pyi +35 -0
- gllm_docproc/parser/document/pptx_parser.pyi +34 -0
- gllm_docproc/parser/document/txt_parser.pyi +22 -0
- gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
- gllm_docproc/parser/html/__init__.pyi +4 -0
- gllm_docproc/parser/html/flat/__init__.pyi +0 -0
- gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
- gllm_docproc/parser/html/nested/__init__.pyi +0 -0
- gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
- gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
- gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
- gllm_docproc/parser/image/__init__.pyi +4 -0
- gllm_docproc/parser/image/image_mime_normalization_parser.pyi +43 -0
- gllm_docproc/parser/image/image_plain_small_filter_parser.pyi +45 -0
- gllm_docproc/parser/pipeline_parser.pyi +33 -0
- gllm_docproc/parser/table/__init__.pyi +3 -0
- gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
- gllm_docproc/request_handler/__init__.pyi +3 -0
- gllm_docproc/request_handler/base_request_handler.pyi +16 -0
- gllm_docproc/response_handler/__init__.pyi +3 -0
- gllm_docproc/response_handler/base_response_handler.pyi +38 -0
- gllm_docproc/utils/__init__.pyi +3 -0
- gllm_docproc/utils/async_utils.pyi +22 -0
- gllm_docproc/utils/file_utils.pyi +76 -0
- gllm_docproc/utils/html_constants.pyi +122 -0
- gllm_docproc/validator/__init__.pyi +6 -0
- gllm_docproc/validator/base_validator.pyi +34 -0
- gllm_docproc/validator/character_count_validator.pyi +26 -0
- gllm_docproc/validator/file_size_validator.pyi +20 -0
- gllm_docproc/validator/model/__init__.pyi +4 -0
- gllm_docproc/validator/model/validator_input.pyi +50 -0
- gllm_docproc/validator/model/validator_result.pyi +19 -0
- gllm_docproc/validator/page_count_validator.pyi +23 -0
- gllm_docproc/validator/pipeline_validator.pyi +40 -0
- gllm_docproc.build/.gitignore +1 -0
- gllm_docproc.cp311-win_amd64.pyd +0 -0
- gllm_docproc.pyi +220 -0
- gllm_docproc_binary-0.7.22.dist-info/METADATA +216 -0
- gllm_docproc_binary-0.7.22.dist-info/RECORD +167 -0
- gllm_docproc_binary-0.7.22.dist-info/WHEEL +5 -0
- gllm_docproc_binary-0.7.22.dist-info/top_level.txt +1 -0
gllm_docproc.pyi
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# This file was generated by Nuitka
|
|
2
|
+
|
|
3
|
+
# Stubs included by default
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
__name__ = ...
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Modules used internally, to allow implicit dependencies to be seen:
|
|
11
|
+
import os
|
|
12
|
+
import abc
|
|
13
|
+
import typing
|
|
14
|
+
import hashlib
|
|
15
|
+
import inspect
|
|
16
|
+
import re
|
|
17
|
+
import langchain_text_splitters
|
|
18
|
+
import gllm_docproc.chunker.table.TableChunker
|
|
19
|
+
import pandas
|
|
20
|
+
import asyncio
|
|
21
|
+
import concurrent
|
|
22
|
+
import concurrent.futures
|
|
23
|
+
import concurrent.futures.ThreadPoolExecutor
|
|
24
|
+
import gllm_multimodal
|
|
25
|
+
import gllm_multimodal.constants
|
|
26
|
+
import gllm_multimodal.modality_converter
|
|
27
|
+
import gllm_multimodal.modality_converter.image_to_text
|
|
28
|
+
import gllm_multimodal.modality_converter.image_to_text.image_to_caption
|
|
29
|
+
import gllm_multimodal.modality_converter.image_to_text.image_to_caption.image_to_caption
|
|
30
|
+
import json
|
|
31
|
+
import gllm_core
|
|
32
|
+
import gllm_core.utils
|
|
33
|
+
import gllm_core.utils.logger_manager
|
|
34
|
+
import gllm_core.utils.retry
|
|
35
|
+
import gllm_inference
|
|
36
|
+
import gllm_inference.builder
|
|
37
|
+
import gllm_inference.output_parser
|
|
38
|
+
import gllm_inference.prompt_builder
|
|
39
|
+
import gllm_multimodal.modality_converter.image_to_text.image_to_caption.preset_image_to_caption
|
|
40
|
+
import gllm_privacy
|
|
41
|
+
import gllm_privacy.pii_detector
|
|
42
|
+
import gllm_privacy.pii_detector.text_analyzer
|
|
43
|
+
import gllm_privacy.pii_detector.text_anonymizer
|
|
44
|
+
import langdetect
|
|
45
|
+
import mimetypes
|
|
46
|
+
import time
|
|
47
|
+
import uuid
|
|
48
|
+
import pathlib
|
|
49
|
+
import magic
|
|
50
|
+
import requests
|
|
51
|
+
import requests.adapters
|
|
52
|
+
import urllib3
|
|
53
|
+
import urllib3.util
|
|
54
|
+
import urllib3.util.retry
|
|
55
|
+
import gllm_docproc.downloader.BaseDownloader
|
|
56
|
+
import ntpath
|
|
57
|
+
import bosa_connectors
|
|
58
|
+
import bosa_connectors.connector
|
|
59
|
+
import bosa_connectors.models
|
|
60
|
+
import bosa_connectors.models.file
|
|
61
|
+
import datetime
|
|
62
|
+
import firecrawl
|
|
63
|
+
import pydantic
|
|
64
|
+
import copy
|
|
65
|
+
import html_to_markdown
|
|
66
|
+
import scrapy
|
|
67
|
+
import gllm_docproc.downloader.html.exception.ItemScrapeFailedException
|
|
68
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.CrawlBaseSpider
|
|
69
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.CrawlSitemapLinkSpider
|
|
70
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.CrawlSitemapSpider
|
|
71
|
+
import gllm_docproc.downloader.html.utils.generate_filename_from_url
|
|
72
|
+
import gllm_docproc.downloader.html.utils.is_valid_url
|
|
73
|
+
import scrapy.http
|
|
74
|
+
import scrapy_playwright
|
|
75
|
+
import scrapy_playwright.page
|
|
76
|
+
import gllm_docproc.downloader.html.utils.clean_url
|
|
77
|
+
import urllib
|
|
78
|
+
import urllib.parse
|
|
79
|
+
import scrapy.crawler
|
|
80
|
+
import scrapy.spiders
|
|
81
|
+
import scrapy.spiders.sitemap
|
|
82
|
+
import scrapy.utils
|
|
83
|
+
import scrapy.utils.sitemap
|
|
84
|
+
import scrapy.linkextractors
|
|
85
|
+
import billiard
|
|
86
|
+
import gllm_docproc.downloader.html.exception.ZyteApiKeyNotProvidedException
|
|
87
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.ScrapeSpider
|
|
88
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.ZyteScrapeSpider
|
|
89
|
+
import gllm_multimodal.utils
|
|
90
|
+
import gllm_multimodal.utils.audio_to_text_utils
|
|
91
|
+
import gllm_docproc.loader.image.ImageLoader
|
|
92
|
+
import gllm_docproc.loader.txt.TXTLoader
|
|
93
|
+
import gllm_docproc.model.Element
|
|
94
|
+
import gllm_docproc.model.LoaderType
|
|
95
|
+
import _frozen_importlib_external
|
|
96
|
+
import gllm_docproc.indexer.BaseIndexer
|
|
97
|
+
import gllm_datastore
|
|
98
|
+
import gllm_datastore.graph_data_store
|
|
99
|
+
import gllm_datastore.graph_data_store.light_rag_data_store
|
|
100
|
+
import lightrag
|
|
101
|
+
import lightrag.lightrag
|
|
102
|
+
import gllm_datastore.graph_data_store.llama_index_graph_rag_data_store
|
|
103
|
+
import gllm_datastore.graph_data_store.llama_index_neo4j_graph_rag_data_store
|
|
104
|
+
import llama_index
|
|
105
|
+
import llama_index.core
|
|
106
|
+
import llama_index.core.base
|
|
107
|
+
import llama_index.core.base.embeddings
|
|
108
|
+
import llama_index.core.base.embeddings.base
|
|
109
|
+
import llama_index.core.base.llms
|
|
110
|
+
import llama_index.core.base.llms.base
|
|
111
|
+
import llama_index.core.indices
|
|
112
|
+
import llama_index.core.indices.property_graph
|
|
113
|
+
import llama_index.core.indices.property_graph.transformations
|
|
114
|
+
import llama_index.core.schema
|
|
115
|
+
import llama_index.core.vector_stores
|
|
116
|
+
import llama_index.core.vector_stores.types
|
|
117
|
+
import __future__
|
|
118
|
+
import gllm_core.schema
|
|
119
|
+
import gllm_core.utils.concurrency
|
|
120
|
+
import gllm_datastore.core
|
|
121
|
+
import gllm_datastore.core.capabilities
|
|
122
|
+
import gllm_inference.schema
|
|
123
|
+
import tqdm
|
|
124
|
+
import gllm_multimodal.modality_converter.audio_to_text
|
|
125
|
+
import gllm_multimodal.modality_converter.audio_to_text.audio_to_text
|
|
126
|
+
import gllm_multimodal.modality_converter.schema
|
|
127
|
+
import csv
|
|
128
|
+
import base64
|
|
129
|
+
import docx2python
|
|
130
|
+
import docx2python.docx_output
|
|
131
|
+
import docx
|
|
132
|
+
import docx.table
|
|
133
|
+
import docx.text
|
|
134
|
+
import docx.text.paragraph
|
|
135
|
+
import gllm_docproc.loader.html.flat.HTMLFlatLoader
|
|
136
|
+
import gllm_docproc.loader.html.nested.HTMLNestedLoader
|
|
137
|
+
import parsel
|
|
138
|
+
import cairosvg
|
|
139
|
+
import gllm_docproc.loader.html.exception.HtmlLoadException
|
|
140
|
+
import tabulate
|
|
141
|
+
import itertools
|
|
142
|
+
import re.sub
|
|
143
|
+
import w3lib
|
|
144
|
+
import w3lib.html
|
|
145
|
+
import io
|
|
146
|
+
import PIL
|
|
147
|
+
import gllm_docproc.loader.exception.UnsupportedFileExtensionError
|
|
148
|
+
import zipfile
|
|
149
|
+
import adobe
|
|
150
|
+
import adobe.pdfservices
|
|
151
|
+
import adobe.pdfservices.operation
|
|
152
|
+
import adobe.pdfservices.operation.auth
|
|
153
|
+
import adobe.pdfservices.operation.auth.service_principal_credentials
|
|
154
|
+
import adobe.pdfservices.operation.io
|
|
155
|
+
import adobe.pdfservices.operation.io.cloud_asset
|
|
156
|
+
import adobe.pdfservices.operation.io.stream_asset
|
|
157
|
+
import adobe.pdfservices.operation.pdf_services
|
|
158
|
+
import adobe.pdfservices.operation.pdf_services_media_type
|
|
159
|
+
import adobe.pdfservices.operation.pdf_services_response
|
|
160
|
+
import adobe.pdfservices.operation.pdfjobs
|
|
161
|
+
import adobe.pdfservices.operation.pdfjobs.jobs
|
|
162
|
+
import adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job
|
|
163
|
+
import adobe.pdfservices.operation.pdfjobs.params
|
|
164
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf
|
|
165
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type
|
|
166
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params
|
|
167
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_renditions_element_type
|
|
168
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.table_structure_type
|
|
169
|
+
import adobe.pdfservices.operation.pdfjobs.result
|
|
170
|
+
import adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result
|
|
171
|
+
import fitz
|
|
172
|
+
import azure
|
|
173
|
+
import azure.ai
|
|
174
|
+
import azure.ai.documentintelligence
|
|
175
|
+
import azure.ai.documentintelligence.models
|
|
176
|
+
import azure.core
|
|
177
|
+
import azure.core.credentials
|
|
178
|
+
import collections
|
|
179
|
+
import collections.Counter
|
|
180
|
+
import pdfminer
|
|
181
|
+
import pdfminer.high_level
|
|
182
|
+
import pdfminer.layout
|
|
183
|
+
import gllm_docproc.loader.BaseLoader
|
|
184
|
+
import pdfplumber
|
|
185
|
+
import pdfplumber._typing
|
|
186
|
+
import pdfplumber.page
|
|
187
|
+
import pdfplumber.table
|
|
188
|
+
import math
|
|
189
|
+
import numpy
|
|
190
|
+
import tabula
|
|
191
|
+
import tabula.io
|
|
192
|
+
import gllm_datastore.cache
|
|
193
|
+
import gllm_datastore.cache.hybrid_cache
|
|
194
|
+
import gllm_datastore.cache.hybrid_cache.hybrid_cache
|
|
195
|
+
import gllm_datastore.cache.hybrid_cache.utils
|
|
196
|
+
import pptx
|
|
197
|
+
import pptx.chart
|
|
198
|
+
import pptx.chart.chart
|
|
199
|
+
import pptx.shapes
|
|
200
|
+
import pptx.shapes.base
|
|
201
|
+
import pptx.table
|
|
202
|
+
import sys
|
|
203
|
+
import soundfile
|
|
204
|
+
import scipy
|
|
205
|
+
import gllm_docproc.loader.exception.VideoConversionError
|
|
206
|
+
import gi
|
|
207
|
+
import gi.repository
|
|
208
|
+
import gllm_docproc.utils.run_async_in_sync
|
|
209
|
+
import openpyxl
|
|
210
|
+
import openpyxl.cell
|
|
211
|
+
import openpyxl.cell.cell
|
|
212
|
+
import openpyxl.worksheet
|
|
213
|
+
import openpyxl.worksheet.worksheet
|
|
214
|
+
import enum
|
|
215
|
+
import gllm_docproc.parser.BaseParser
|
|
216
|
+
import subprocess
|
|
217
|
+
import tempfile
|
|
218
|
+
import gllm_multimodal.utils.image_utils
|
|
219
|
+
import codecs
|
|
220
|
+
import types
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: gllm-docproc-binary
|
|
3
|
+
Version: 0.7.22
|
|
4
|
+
Summary: A library for orchestrating the processing of documents, typically in Gen AI applications (but not limited to Gen AI).
|
|
5
|
+
Author-email: GenAI SDK Team <gat-sdk@gdplabs.id>
|
|
6
|
+
Requires-Python: <3.13,>=3.11
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: bosa-connectors-binary<0.4.0,>=0.3.0
|
|
9
|
+
Requires-Dist: gllm-core-binary<0.4.0,>=0.3.0
|
|
10
|
+
Requires-Dist: gllm-datastore-binary[chroma,elasticsearch]<0.6.0,>=0.5.0
|
|
11
|
+
Requires-Dist: gllm-multimodal-binary[audio]<0.3.0,>=0.2.0
|
|
12
|
+
Requires-Dist: gllm-privacy-binary<0.5.0,>=0.4.0
|
|
13
|
+
Requires-Dist: langchain-text-splitters<0.4.0,>=0.3.2
|
|
14
|
+
Requires-Dist: pandas<3.0.0,>=2.2.3
|
|
15
|
+
Requires-Dist: pydantic<3.0.0,>=2.9.1
|
|
16
|
+
Requires-Dist: tabulate<0.10.0,>=0.9.0
|
|
17
|
+
Requires-Dist: python-magic<0.5.0,>=0.4.27; sys_platform != "win32"
|
|
18
|
+
Requires-Dist: python-magic-bin<0.5.0,>=0.4.14; sys_platform == "win32"
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: coverage<8.0.0,>=7.4.4; extra == "dev"
|
|
21
|
+
Requires-Dist: mypy<2.0.0,>=1.15.0; extra == "dev"
|
|
22
|
+
Requires-Dist: pre-commit<4.0.0,>=3.7.0; extra == "dev"
|
|
23
|
+
Requires-Dist: pytest<9.0.0,>=8.1.1; extra == "dev"
|
|
24
|
+
Requires-Dist: pytest-asyncio<1.0.0,>=0.23.6; extra == "dev"
|
|
25
|
+
Requires-Dist: pytest-cov<6.0.0,>=5.0.0; extra == "dev"
|
|
26
|
+
Requires-Dist: ruff<1.0.0,>=0.6.7; extra == "dev"
|
|
27
|
+
Provides-Extra: audio
|
|
28
|
+
Requires-Dist: librosa<0.11.0,>=0.10.1; extra == "audio"
|
|
29
|
+
Requires-Dist: tqdm<5.0.0,>=4.66.2; extra == "audio"
|
|
30
|
+
Provides-Extra: docx
|
|
31
|
+
Requires-Dist: docx2python<3.0.0,>=2.8.0; extra == "docx"
|
|
32
|
+
Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == "docx"
|
|
33
|
+
Provides-Extra: html
|
|
34
|
+
Requires-Dist: billiard<5.0.0,>=4.2.1; extra == "html"
|
|
35
|
+
Requires-Dist: firecrawl-py<5.0.0,>=4.3.6; extra == "html"
|
|
36
|
+
Requires-Dist: html-to-markdown<2.0.0,>=1.9.0; extra == "html"
|
|
37
|
+
Requires-Dist: playwright<2.0.0,>=1.40.0; extra == "html"
|
|
38
|
+
Requires-Dist: scrapy<3.0.0,>=2.11.0; extra == "html"
|
|
39
|
+
Requires-Dist: scrapy-playwright<0.1.0,>=0.0.33; extra == "html"
|
|
40
|
+
Requires-Dist: scrapy-zyte-api<1.0.0,>=0.12.2; extra == "html"
|
|
41
|
+
Requires-Dist: zyte-api<1.0.0,>=0.4.8; extra == "html"
|
|
42
|
+
Provides-Extra: html-svg
|
|
43
|
+
Requires-Dist: cairosvg<3.0.0,>=2.8.2; extra == "html-svg"
|
|
44
|
+
Provides-Extra: image
|
|
45
|
+
Requires-Dist: aioresponses<1.0.0,>=0.7.0; extra == "image"
|
|
46
|
+
Requires-Dist: boto3<2.0.0,>=1.38.10; extra == "image"
|
|
47
|
+
Requires-Dist: pillow<12.0.0,>=11.2.1; extra == "image"
|
|
48
|
+
Provides-Extra: kg
|
|
49
|
+
Requires-Dist: asyncpg<1.0.0,>=0.30.0; extra == "kg"
|
|
50
|
+
Requires-Dist: gllm-datastore-binary[kg]<0.6.0,>=0.5.0; extra == "kg"
|
|
51
|
+
Requires-Dist: lightrag-hku<2.0.0,>=1.4.6; extra == "kg"
|
|
52
|
+
Requires-Dist: llama-index-embeddings-openai<1.0.0,>=0.3.0; extra == "kg"
|
|
53
|
+
Requires-Dist: llama-index-llms-openai<1.0.0,>=0.3.0; extra == "kg"
|
|
54
|
+
Provides-Extra: pdf
|
|
55
|
+
Requires-Dist: azure-ai-documentintelligence<2.0.0,>=1.0.0b3; extra == "pdf"
|
|
56
|
+
Requires-Dist: jpype1<2.0.0,>=1.5.0; extra == "pdf"
|
|
57
|
+
Requires-Dist: pdfminer-six<20250000,>=20231228; extra == "pdf"
|
|
58
|
+
Requires-Dist: pdfplumber<1.0.0,>=0.11.4; extra == "pdf"
|
|
59
|
+
Requires-Dist: pdfservices-sdk<5.0.0,>=4.0.0; extra == "pdf"
|
|
60
|
+
Requires-Dist: pymupdf<2.0.0,>=1.24.10; extra == "pdf"
|
|
61
|
+
Requires-Dist: tabula-py<3.0.0,>=2.9.3; extra == "pdf"
|
|
62
|
+
Provides-Extra: pii
|
|
63
|
+
Requires-Dist: langdetect<2.0.0,>=1.0.0; extra == "pii"
|
|
64
|
+
Requires-Dist: torch<3.0.0,>=2.0.0; extra == "pii"
|
|
65
|
+
Provides-Extra: pptx
|
|
66
|
+
Requires-Dist: python-pptx<2.0.0,>=1.0.2; extra == "pptx"
|
|
67
|
+
Provides-Extra: video
|
|
68
|
+
Requires-Dist: PyGObject==3.50.0; sys_platform != "win32" and extra == "video"
|
|
69
|
+
Requires-Dist: numpy<2.0.0,>=1.26.0; extra == "video"
|
|
70
|
+
Requires-Dist: scipy<2.0.0,>=1.15.0; extra == "video"
|
|
71
|
+
Requires-Dist: soundfile<0.14.0,>=0.13.1; extra == "video"
|
|
72
|
+
Provides-Extra: xlsx
|
|
73
|
+
Requires-Dist: openpyxl<4.0.0,>=3.0.10; extra == "xlsx"
|
|
74
|
+
|
|
75
|
+
# GLLM Docproc
|
|
76
|
+
|
|
77
|
+
## Description
|
|
78
|
+
A library for orchestrating the processing of documents, typically in Gen AI applications (but not limited to Gen AI).
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Installation
|
|
83
|
+
|
|
84
|
+
### Prerequisites
|
|
85
|
+
|
|
86
|
+
Mandatory:
|
|
87
|
+
1. Python 3.11+ — [Install here](https://www.python.org/downloads/)
|
|
88
|
+
2. pip — [Install here](https://pip.pypa.io/en/stable/installation/)
|
|
89
|
+
3. uv — [Install here](https://docs.astral.sh/uv/getting-started/installation/)
|
|
90
|
+
4. gcloud CLI (for authentication) — [Install here](https://cloud.google.com/sdk/docs/install), then log in using:
|
|
91
|
+
```bash
|
|
92
|
+
gcloud auth login
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
### Install from Artifact Registry
|
|
98
|
+
|
|
99
|
+
This requires authentication via the `gcloud` CLI.
|
|
100
|
+
|
|
101
|
+
1. Export token
|
|
102
|
+
```
|
|
103
|
+
export GCLOUD_ACCESS_TOKEN="$(gcloud auth print-access-token)"
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
2. Configure the index in your `pyproject.toml`
|
|
107
|
+
```
|
|
108
|
+
[[tool.uv.index]]
|
|
109
|
+
name = "gen-ai-internal"
|
|
110
|
+
url = "https://oauth2accesstoken:${GCLOUD_ACCESS_TOKEN}@glsdk.gdplabs.id/gen-ai-internal/simple/"
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
3. Add the dependency
|
|
114
|
+
```
|
|
115
|
+
uv add gllm-docproc
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Local Development Setup
|
|
121
|
+
|
|
122
|
+
### Prerequisites
|
|
123
|
+
|
|
124
|
+
1. Python 3.11+ — [Install here](https://www.python.org/downloads/)
|
|
125
|
+
2. pip — [Install here](https://pip.pypa.io/en/stable/installation/)
|
|
126
|
+
3. uv — [Install here](https://docs.astral.sh/uv/getting-started/installation/)
|
|
127
|
+
4. gcloud CLI — [Install here](https://cloud.google.com/sdk/docs/install), then log in using:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
gcloud auth login
|
|
131
|
+
```
|
|
132
|
+
5. Git — [Install here](https://git-scm.com/downloads)
|
|
133
|
+
6. Access to the [GDP Labs SDK GitHub repository](https://github.com/GDP-ADMIN/gl-sdk)
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
### 1. Clone Repository
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
git clone git@github.com:GDP-ADMIN/gl-sdk.git
|
|
141
|
+
cd gl-sdk/libs/gllm-docproc
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
### 2. Setup Authentication
|
|
147
|
+
|
|
148
|
+
Set the following environment variables to authenticate with internal package indexes:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
export UV_INDEX_GEN_AI_INTERNAL_USERNAME=oauth2accesstoken
|
|
152
|
+
export UV_INDEX_GEN_AI_INTERNAL_PASSWORD="$(gcloud auth print-access-token)"
|
|
153
|
+
export UV_INDEX_GEN_AI_USERNAME=oauth2accesstoken
|
|
154
|
+
export UV_INDEX_GEN_AI_PASSWORD="$(gcloud auth print-access-token)"
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
### 3. Quick Setup
|
|
160
|
+
|
|
161
|
+
Run:
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
make setup
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
### 4. Activate Virtual Environment
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
source .venv/bin/activate
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Local Development Utilities
|
|
178
|
+
|
|
179
|
+
The following Makefile commands are available for quick operations:
|
|
180
|
+
|
|
181
|
+
### Install uv
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
make install-uv
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Install Pre-Commit
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
make install-pre-commit
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Install Dependencies
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
make install
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Update Dependencies
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
make update
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Run Tests
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
make test
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Contributing
|
|
214
|
+
|
|
215
|
+
Please refer to the [Python Style Guide](https://docs.google.com/document/d/1uRggCrHnVfDPBnG641FyQBwUwLoFw0kTzNqRm92vUwM/edit?usp=sharing)
|
|
216
|
+
for information about code style, documentation standards, and SCA requirements.
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
gllm_docproc.cp311-win_amd64.pyd,sha256=VFQvsLfgeK22HGeCw0e79bm6W4_Ds3yY8WhXTLb-QsI,4417024
|
|
2
|
+
gllm_docproc.pyi,sha256=MyO85LevGsOhcS3HJUvc_72LpQP_UBa3dgl5h7VjpVY,7100
|
|
3
|
+
gllm_docproc/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
gllm_docproc/chunker/__init__.pyi,sha256=GOOIYg0-Fjd3g9uJDo9q8J0Gabwt_GHD_44axN6Y-qc,83
|
|
5
|
+
gllm_docproc/chunker/base_chunker.pyi,sha256=1sOrmm0vHwog08QolTqvW6bv5AJ7Wi0Mg9R_e_A8Enc,1160
|
|
6
|
+
gllm_docproc/chunker/structured_element/__init__.pyi,sha256=0SzEj-OALKTVr4v4WgKJVZUnemuzMkhunZndFhPlR4w,136
|
|
7
|
+
gllm_docproc/chunker/structured_element/chunk_enricher.pyi,sha256=Va5FJQEmZu1L88P1b846CUMNyzSzRrQ-cOUTOR4bzg8,1763
|
|
8
|
+
gllm_docproc/chunker/structured_element/structured_element_chunker.pyi,sha256=b8mxV29ozJGKVPTWTpCa6KXE1rTavZlL350EpFR1wYI,4157
|
|
9
|
+
gllm_docproc/chunker/table/__init__.pyi,sha256=HP844LD8YE2VZWqiYLFEOYIM3qIHggqHm5BCAn15xX4,162
|
|
10
|
+
gllm_docproc/chunker/table/table_chunker.pyi,sha256=EqqDL_l9olB81p3x_EG-bjNj2-RizHrLmOA5VVwQf0o,1898
|
|
11
|
+
gllm_docproc/converter/__init__.pyi,sha256=jqqxJRyzpYAPcH6HaFjeuVTGAoxgEvMSOYc0SR2iy6c,91
|
|
12
|
+
gllm_docproc/converter/base_converter.pyi,sha256=lm9KeWtmf61oSU2tQDFKsrBuDKQc5ZMLC80Lz0VZMPM,442
|
|
13
|
+
gllm_docproc/data_generator/__init__.pyi,sha256=1yHopJTd6IRClBEaS0H-9hKJtFIy8dqCpbQDYh2vl70,527
|
|
14
|
+
gllm_docproc/data_generator/base_data_generator.pyi,sha256=y35Gs4-U1WLi7KlTBbMQ0ZQ8PZ-cMShvnjUgPaBgYPo,790
|
|
15
|
+
gllm_docproc/data_generator/image_data_generator/__init__.pyi,sha256=uyBXhMl9V81F2IAWzoXMzheYZNkqqLPRSsFb3PVhvrQ,406
|
|
16
|
+
gllm_docproc/data_generator/image_data_generator/image_caption_data_generator.pyi,sha256=RDncQDpFlJFHgEQU9PftlPEmdI-es2-jAaMQhbjP8Sg,2160
|
|
17
|
+
gllm_docproc/data_generator/image_data_generator/multi_model_image_caption_data_generator.pyi,sha256=wNNcudgmSLAuRqPH-0yCfcrUHH3qvcZ0orHrmxMv3B8,2771
|
|
18
|
+
gllm_docproc/data_generator/pii_data_generator/__init__.pyi,sha256=oNDmiHWIVo9IqZrmwj3TjFlJ3fn-ajHoxILrMBjY248,123
|
|
19
|
+
gllm_docproc/downloader/__init__.pyi,sha256=P67jUf_2Odq5xN_hy-XVrrw_hkd3Oqb_g7ygsgmXdlo,324
|
|
20
|
+
gllm_docproc/downloader/base_downloader.pyi,sha256=BNHN_JxZYa6lIiKrPC1It8KDS5P5XWi-IUs1xK9fvkY,832
|
|
21
|
+
gllm_docproc/downloader/direct_file_url_downloader.pyi,sha256=XEE_ZkNZd8DPB-c4gn7SJL4g9XWcqxE3vrhcB3KIFEk,1862
|
|
22
|
+
gllm_docproc/downloader/google_drive_downloader.pyi,sha256=j20WE_p5Cdf99qXvaWlkqi66N1bm_h1HlEoVPlgqTc0,1589
|
|
23
|
+
gllm_docproc/downloader/html/__init__.pyi,sha256=XxbGV-dz6ByYk91vXbu6UqSdtaLDuTpdrBHa6bKYjpM,344
|
|
24
|
+
gllm_docproc/downloader/html/firecrawl_downloader.pyi,sha256=Et44c2afB54d3iHW5CXgUHkFtEAGMtA-_0KR71VTyPI,2472
|
|
25
|
+
gllm_docproc/downloader/html/html_downloader.pyi,sha256=ossOxNEN4O10Mcug0oDTktC9DXZf4Mqu1FEsB00E15Q,5965
|
|
26
|
+
gllm_docproc/downloader/html/requests_downloader.pyi,sha256=1mWBhGlYArNmB4zS4v_7-j1kYEg8dHXXrUcWZ8BBeJo,2401
|
|
27
|
+
gllm_docproc/downloader/html/exception/__init__.pyi,sha256=Sx4tSQh97yLWL5dSd6ZtdUiNEpoMpupTVDvJWHVjA9g,290
|
|
28
|
+
gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi,sha256=PB0Uurm2v3anDds1eV9IGnw8fzuTjAZNc_E2sl_LrWM,623
|
|
29
|
+
gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi,sha256=UD1WzdOLnm4pWgu3BSVen9Hp41FzZ_9W_8eDF9FfDJQ,604
|
|
30
|
+
gllm_docproc/downloader/html/scraper/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
+
gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi,sha256=3ssFcaAgxl07a0zVoioQfPIOgXhkQVe_isisclwwztk,2155
|
|
32
|
+
gllm_docproc/downloader/html/scraper/scraper/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi,sha256=2FI0gNdvv8PDRTjfEr_1xZ1PNhufmw2i_T1PhpZReOM,667
|
|
34
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi,sha256=GDx4ViQhGi0ewHpei2bseDQoPdHM1NZJBrC-NXzjopg,1366
|
|
35
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi,sha256=HrB-wf9ERVL95cFp9kUupsV9JdfY5SP-FFAmW1TqAoY,1148
|
|
36
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi,sha256=qc32owSa6u7vtfzIrqooCRhAo-cmNfkKiD8xnSeKb50,2605
|
|
37
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi,sha256=Nfo-6elB52U3rC_osCnuc2Oedyqs4w0-Fc_QHcitAjE,2633
|
|
38
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi,sha256=YuWJJIhYfA53Z2_GJCoRjOVUdTr7uw-9iEtzhVg4pUQ,969
|
|
39
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi,sha256=G-EZweoO5NjjicMdq5yKLMIHhk1vMrYoA1zzIydLUtA,2139
|
|
40
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi,sha256=psibiVfIauedH_yq814QlXFegvWrymLQpskwFGBkyog,2189
|
|
41
|
+
gllm_docproc/downloader/html/utils/__init__.pyi,sha256=tPm7b-zPyJLbBYKxg5apjGXiJT3vHfS4WqOJ6Cd3A6E,208
|
|
42
|
+
gllm_docproc/downloader/html/utils/web_utils.pyi,sha256=FBEblpXsqC0pR6nRpsU3ziZcUXNcYHpP9FkUA8AnHAk,1434
|
|
43
|
+
gllm_docproc/dpo_router/__init__.pyi,sha256=PZ2ZjotPyu5y0to7O02t3PocsQqIFTtnPJFBJpnuHvM,238
|
|
44
|
+
gllm_docproc/dpo_router/base_dpo_router.pyi,sha256=bwSj4_EHQWVcOghZoidCrn4OBiUsZM6NQpLS_cDHkCI,595
|
|
45
|
+
gllm_docproc/dpo_router/loader_router.pyi,sha256=8uEqkI7tEjixPkj8SXR_iDQgFhQV4CeeHeabzu-VLXw,2411
|
|
46
|
+
gllm_docproc/dpo_router/parser_router.pyi,sha256=8rV1ImqjvKpmdTaUGfgpcq3-r188kh_-jLKPQVca9TE,1975
|
|
47
|
+
gllm_docproc/housekeeping/__init__.pyi,sha256=oL-C1roDf8io8zh90D4ugD_jAXdB_1CCcggZB1b5sTs,103
|
|
48
|
+
gllm_docproc/housekeeping/base_housekeeping.pyi,sha256=jV3Y5iDsU5biIjm_NJMThUeIK5g77fB3OkRv3dsfEiA,430
|
|
49
|
+
gllm_docproc/indexer/__init__.pyi,sha256=DAX3M09IFTLRzUmyd4XL0a6jhnZmE8UEPxZ-5dTlSt8,103
|
|
50
|
+
gllm_docproc/indexer/base_indexer.pyi,sha256=ITBr186WQb-dNRw6jTEyQDC0wbB7455hp9TtZWR4lP8,1082
|
|
51
|
+
gllm_docproc/indexer/graph/__init__.pyi,sha256=5dcxr1z5CpUJE5Pk9NhI1doLTUk3LbC-GvrOi0GQQ0s,314
|
|
52
|
+
gllm_docproc/indexer/graph/graph_rag_indexer.pyi,sha256=EbzrO4YA09eq2UY4Xyjr2WRwFaIncRYnsepNk3Vcuvg,379
|
|
53
|
+
gllm_docproc/indexer/graph/light_rag_graph_rag_indexer.pyi,sha256=GD0GOykrAHGRJJGI858MA0tYXPnWYTUK0urSbDrhaOY,4288
|
|
54
|
+
gllm_docproc/indexer/graph/llama_index_graph_rag_indexer.pyi,sha256=HlK2bSA-JUHD5wkOlRHIHEChz4RH3xkr8amM7ViBlqY,4897
|
|
55
|
+
gllm_docproc/indexer/vector/__init__.pyi,sha256=st34Q3GoXyMyic0v09arzV7gscH2yPEIp7z-5wWLOkE,100
|
|
56
|
+
gllm_docproc/indexer/vector/vector_db_indexer.pyi,sha256=TH_KL0wgZ76XjOdOd-gZstaKpb2tKySxETkBKNDMO8o,2413
|
|
57
|
+
gllm_docproc/loader/__init__.pyi,sha256=Lzsi_ajlFYzu2teZ2kKiu7HHRHz7-M5ubpJ7XRmZHcg,160
|
|
58
|
+
gllm_docproc/loader/base_loader.pyi,sha256=Bzo7h7czTHcbBg6m85r1T2_rWlcQYOgDaF1EUbI3bOs,1352
|
|
59
|
+
gllm_docproc/loader/loader_utils.pyi,sha256=xt0nyJR3I8t8_lxu-ptJmy38sNhUITnh9kUQISeFLXQ,1871
|
|
60
|
+
gllm_docproc/loader/pipeline_loader.pyi,sha256=xhpvW6EU7mpPHYddrsuSyG3v_981B5Q05Je8ujoexuw,1819
|
|
61
|
+
gllm_docproc/loader/audio/__init__.pyi,sha256=bJMhxunQuKPnJzjXiIkJTtqdaefxb9f33Xw2QB2Wco0,83
|
|
62
|
+
gllm_docproc/loader/audio/audio_loader.pyi,sha256=LHtSs9W78VJ32qStc7lRsFBhwgP3mdWbILwl1n5INRU,2288
|
|
63
|
+
gllm_docproc/loader/csv/__init__.pyi,sha256=Wr2rgvI9f3_yF7Wc6fhN8cDo5RCqTz_GTbJjpJsIivA,87
|
|
64
|
+
gllm_docproc/loader/csv/pandas_loader.pyi,sha256=YNlkJ0scQFUFcV62aMIh6XbUdw96UYZ8NhhVdZROkWI,2861
|
|
65
|
+
gllm_docproc/loader/docx/__init__.pyi,sha256=5JNIvnZyO9ZOMHJHSemAsFafwAf20iUxAiMDnR1U17I,308
|
|
66
|
+
gllm_docproc/loader/docx/docx2python_loader.pyi,sha256=l3HND1hsLBRQ6j_OobH5Y6_rQlkGTXe6KD3DZabaXHI,2569
|
|
67
|
+
gllm_docproc/loader/docx/python_docx_loader.pyi,sha256=3ulT68shiwzK1dOopdgZKWcUTAuMcGhZIrJjwEhQC6M,2051
|
|
68
|
+
gllm_docproc/loader/docx/python_docx_table_loader.pyi,sha256=tQp_Fypis_pz_3VZddrpujV4abzX1xtoXOaDFJSe3R4,1756
|
|
69
|
+
gllm_docproc/loader/exception/__init__.pyi,sha256=5mgwCzDO-ZPFFmpuethHJOMKvT9bSR5DRLTaqoeaaHM,263
|
|
70
|
+
gllm_docproc/loader/exception/unsupported_file_extension_error.pyi,sha256=c5xVjuQPNYZSuvlCHi87KMfrhBK-4R_i7-wr69QdUvQ,268
|
|
71
|
+
gllm_docproc/loader/exception/video_conversion_error.pyi,sha256=SKbp7V8qXuOpPTbUfYOnAn6LcXoGaqpLzN14Lm2aFkc,442
|
|
72
|
+
gllm_docproc/loader/html/__init__.pyi,sha256=hzLamxCb2AwLI8xO9ty0f-qY0kD97iAXCwUfBJyxkYw,244
|
|
73
|
+
gllm_docproc/loader/html/html_base_loader.pyi,sha256=zPdRd0mMw90Q6pwJNzXWWEgea-7tYyHq0Pv1eSL3QZw,1185
|
|
74
|
+
gllm_docproc/loader/html/exception/__init__.pyi,sha256=4pNokkFZd_UWkgce5hvYUS6f5F_b1xZ_DShwmWakUDA,108
|
|
75
|
+
gllm_docproc/loader/html/exception/html_load_exception.pyi,sha256=R9Was3AEqcUXqqbARpgfQ5rnRS0pTL8gPXimqbWXFbo,261
|
|
76
|
+
gllm_docproc/loader/html/flat/__init__.pyi,sha256=GDjScQBkVstxxPnK4DIc3UE81shLXHjmJxgim7q1oj0,96
|
|
77
|
+
gllm_docproc/loader/html/flat/html_flat_base_handler.pyi,sha256=xeH7Q23LhWFeCVgvbZ757_-zWJileknaUgTiU5wUJwg,3250
|
|
78
|
+
gllm_docproc/loader/html/flat/html_flat_loader.pyi,sha256=g8ZSgVvdKV1JQIhzPER_mkNUv27OmfiwM5LptQNC0_8,1811
|
|
79
|
+
gllm_docproc/loader/html/flat/html_flat_merger.pyi,sha256=q3dxADexBv_yHp3-lmtRIwx5Wf-eotrQiBYaR5Do9nE,1378
|
|
80
|
+
gllm_docproc/loader/html/nested/__init__.pyi,sha256=nG7Z3zV4Z4KIG2MVWgaB7V6LA9LyN3JkgMkWdItA0dA,104
|
|
81
|
+
gllm_docproc/loader/html/nested/dictionary_utils.pyi,sha256=mcREqjBuSZ01idMXHu4Kfqa4aITdb4Mvlp8x7SZHX3U,1534
|
|
82
|
+
gllm_docproc/loader/html/nested/html_nested_base_handler.pyi,sha256=CDxr-Tq0qY5sM8fQujdxDr8YTnY2CJJj1PMeB28HzNQ,4862
|
|
83
|
+
gllm_docproc/loader/html/nested/html_nested_element_handler.pyi,sha256=d-vld6eNj_tjcLNRO2nX41Q_vdlZxoiq4kO2zrBuw2k,940
|
|
84
|
+
gllm_docproc/loader/html/nested/html_nested_loader.pyi,sha256=bOw4p5r4H7qYjY6F6ndDdlkGjjqz7Ze9A47oh19yhwo,932
|
|
85
|
+
gllm_docproc/loader/html/utils/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
86
|
+
gllm_docproc/loader/html/utils/flat_table_utils.pyi,sha256=gYMUixQABTQ-tRsPJPjzAuFErMssvyx7D2NxdDKk25M,1993
|
|
87
|
+
gllm_docproc/loader/html/utils/html_utils.pyi,sha256=T9EC30ChY89B6zvLEf2KI45yoawWZK_ZRRXaqODwaNs,1669
|
|
88
|
+
gllm_docproc/loader/html/utils/removed_components.pyi,sha256=wwpHUfu1HCMNIf8pvS52ibBKx3iAOXpq0TjkFPllGm8,2143
|
|
89
|
+
gllm_docproc/loader/html/utils/string_utils.pyi,sha256=HiKHATp4DGiORMI5WGadVKuNsqHMLVISIJi4MRXQhUs,1125
|
|
90
|
+
gllm_docproc/loader/html/utils/table_utils.pyi,sha256=_6xDInTK0o-XQi6Ylliqijpm8mjGfWjw4rtC_E7XVPY,2818
|
|
91
|
+
gllm_docproc/loader/image/__init__.pyi,sha256=ry_XYHaJlcb2zfTdt4wuiYoEClxgPsWKuGuDYkIQ2gE,83
|
|
92
|
+
gllm_docproc/loader/image/image_loader.pyi,sha256=AG4z5EKerNfn7NefNZFr4Fm9DLiPgm7JXCTUjLWBKjs,2517
|
|
93
|
+
gllm_docproc/loader/json/__init__.pyi,sha256=UwKxlnJQCMBevsVLtEdNqbUwTjSUDAiGVQ1RPCiXQYc,112
|
|
94
|
+
gllm_docproc/loader/json/json_elements_loader.pyi,sha256=1gwS5Osxby09X_3TFbr66_YY221dRonCS8dewwwmSH4,1501
|
|
95
|
+
gllm_docproc/loader/pdf/__init__.pyi,sha256=hNLgphVwW4aZsYmJV4t5zZMvTgVOXuAIn4HveSYAsIg,1290
|
|
96
|
+
gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi,sha256=__nBHGfSc8OXlvk6zchSM_D4Up_x6Z-DaLyCnDBiZww,1728
|
|
97
|
+
gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi,sha256=-_f-jI1GkSQSIXVzrTvAGvg5pjF2vJqwAFHXTDA7_ys,2661
|
|
98
|
+
gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi,sha256=64yifR4-VOtwBF8r3jlCD85pInibdbdWgWbeylbPCzo,2677
|
|
99
|
+
gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi,sha256=WDY5FX7OcINZKU_IZJY7qHrmzW-YQiOLMjWmyZqS6P8,1698
|
|
100
|
+
gllm_docproc/loader/pdf/pdf_loader_utils.pyi,sha256=ztwgYvjM80ESRZ5obioIidGwfm-TPfcC0Qd5nPCdH1Y,2997
|
|
101
|
+
gllm_docproc/loader/pdf/pdf_miner_loader.pyi,sha256=Ssk6kqgzucDpK_ohASfXvg5bK53dm8ktSmhClxjDsAU,2254
|
|
102
|
+
gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi,sha256=vpikwwzR6j26VO3UdY_UqcUavM-TozgxSwSSYTl50HY,1551
|
|
103
|
+
gllm_docproc/loader/pdf/pdf_page_loader.pyi,sha256=nptf8ZTvyD4UQ48A7cgggT7epRqzGsegbDQewDnoRUY,2071
|
|
104
|
+
gllm_docproc/loader/pdf/pdf_plumber_loader.pyi,sha256=eK_1IuYTKJmNpEWP5JQAUQ1I4H_C9sR-uqItECKTlJk,2052
|
|
105
|
+
gllm_docproc/loader/pdf/pymupdf_loader.pyi,sha256=Twko6w42Yficql3jBNnm0nBlxjuehSzUXJ6cq58ydiM,3498
|
|
106
|
+
gllm_docproc/loader/pdf/pymupdf_span_loader.pyi,sha256=srv1fLohpcHVampSs-joNPhilVzYwwA39MgTW-BBgnc,3518
|
|
107
|
+
gllm_docproc/loader/pdf/pymupdf_utils.pyi,sha256=RlleXspnjGcibUxwY13-kCr6d0qJm2uEfNmAqiIlq9c,3678
|
|
108
|
+
gllm_docproc/loader/pdf/tabula_loader.pyi,sha256=zNhW3KVr3YQLyMJQamHXd2B_QQ925r1IqprHiXM42Z8,1788
|
|
109
|
+
gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi,sha256=xRWomeYCvzMca1YeSNQYD8ArDpHDQf9Wz9gn3dTAfX0,2045
|
|
110
|
+
gllm_docproc/loader/pptx/__init__.pyi,sha256=1gb8IxMQiZkkcGZl41rdYEGjxvCvEwfWC4ZwPWyK3x0,104
|
|
111
|
+
gllm_docproc/loader/pptx/python_pptx_loader.pyi,sha256=tGSO_9hQzhFoeDd6HPlRNdERUE85heOAUPmEfS62ETQ,2580
|
|
112
|
+
gllm_docproc/loader/txt/__init__.pyi,sha256=fwhz7Y79UKInwtd_4tnq3sXFH4YdsfE-zjykWnZKmLI,75
|
|
113
|
+
gllm_docproc/loader/txt/txt_loader.pyi,sha256=Wos2IRSktx6_MnU4hFf7IuQg3GWSuGIygWk3Lfev1xM,2274
|
|
114
|
+
gllm_docproc/loader/video/__init__.pyi,sha256=UAwriOtlfSfQu03fZHZnUgTgErN60Hjqp_SONhxHbUY,124
|
|
115
|
+
gllm_docproc/loader/video/video_loader_utils.pyi,sha256=JMqeQTng9MOFjQQ_D3NrfTJBaHWyh5Y6PLq3coIbNJg,4814
|
|
116
|
+
gllm_docproc/loader/video/video_transcript_loader.pyi,sha256=Xs37HQvapUySlIRdgm_mzfUJ4hqtHmp1uXhYu-4Homg,3209
|
|
117
|
+
gllm_docproc/loader/xlsx/__init__.pyi,sha256=bdF9g2QKvO0E_aTBdfbwXKivHTAohdRHBQYMou8Yr2s,95
|
|
118
|
+
gllm_docproc/loader/xlsx/openpyxl_loader.pyi,sha256=ozyTYhzOSiSOqvKmL44f2434eOuwDcNpx5AoXISb0LA,1962
|
|
119
|
+
gllm_docproc/model/__init__.pyi,sha256=J1qIuXV1QS2Sp3mR7JkZZiYcKo0zPhcerxfxdQRijEg,325
|
|
120
|
+
gllm_docproc/model/element.pyi,sha256=Avt92ckAKJUXGI1Mfw6XtPRgjaY2yW8DZJJ-Se_l3CM,1096
|
|
121
|
+
gllm_docproc/model/element_metadata.pyi,sha256=p9WqqfPfRpLZaZIkkAfRVzRvt-8OhaT-gZgCc_tLXZQ,846
|
|
122
|
+
gllm_docproc/model/loader_type.pyi,sha256=YC3BxBLR7fyh1J7BkpmU6ELLfj3kdwJO5PJd7luoGpE,433
|
|
123
|
+
gllm_docproc/model/media.pyi,sha256=ypQC8HVZ9BGHLCGTEolykq_ynryX1YPbnmFxbkiCnYw,1711
|
|
124
|
+
gllm_docproc/model/parser_type.pyi,sha256=7q2T_l_WSr9BYiT5ToZNUr03Ulw04bPiexmkYH4Ji84,402
|
|
125
|
+
gllm_docproc/parser/__init__.pyi,sha256=7ylnmzWFyW4_XTVkxyj9iaIfRfBQu6_d5c-Jj781nlY,160
|
|
126
|
+
gllm_docproc/parser/base_parser.pyi,sha256=Sun6W4Iv92G_tztu5k6xdYmwA96qxPLK6DPoDf2yt_I,1163
|
|
127
|
+
gllm_docproc/parser/pipeline_parser.pyi,sha256=TBVbZQEdU9T-oNE4r9BE0VW1wu_sjy6VPPNVvwMdG9U,1270
|
|
128
|
+
gllm_docproc/parser/document/__init__.pyi,sha256=DgVCz5skXQW_DzQ0pWFpxNvrEuE8ehGPOuK7kAFCYC4,331
|
|
129
|
+
gllm_docproc/parser/document/docx_parser.pyi,sha256=XeMY-pNyEUoUg4xqtc7XMmGGImumWMmzC9C9H4iMTB4,1546
|
|
130
|
+
gllm_docproc/parser/document/pdf_parser.pyi,sha256=ZJvY4TkVsXAZw_NM6vBjqPgYmk__gI3Nkox4-oQCStQ,1546
|
|
131
|
+
gllm_docproc/parser/document/pptx_parser.pyi,sha256=MNROmr0yTvgrDJFh361ZotMOMDvJMMVM2BXrF_vlUhQ,1624
|
|
132
|
+
gllm_docproc/parser/document/txt_parser.pyi,sha256=sdERCHZ_lR6B7Q0oTDKp0oQm55mPVH1M-04fuZWpzkI,885
|
|
133
|
+
gllm_docproc/parser/document/xlsx_parser.pyi,sha256=yylVDHfUm5i_jD0ECjo2KamVxGjsrV2_t86TTsNDgdU,1094
|
|
134
|
+
gllm_docproc/parser/html/__init__.pyi,sha256=DAG6lL1SfvN7euCP0XSq2Bqmai931jx9_Oj8u4LGN7I,198
|
|
135
|
+
gllm_docproc/parser/html/flat/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
136
|
+
gllm_docproc/parser/html/flat/html_flat_parser.pyi,sha256=v7zbnvnBBg8TTBd4cq5q2PElyePHkKwtUc9B3y1PAGA,1280
|
|
137
|
+
gllm_docproc/parser/html/nested/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
138
|
+
gllm_docproc/parser/html/nested/html_json_processor.pyi,sha256=jnEWMBxVxh5f99YCmZk5oqRR4T2qPAYNEfExPMkJ1nc,6477
|
|
139
|
+
gllm_docproc/parser/html/nested/html_nested_parser.pyi,sha256=O3JhTWHIXWKNCl4kfk4lWH1SQRiv3bRbu_OMDSHicFA,1014
|
|
140
|
+
gllm_docproc/parser/html/nested/nested_element.pyi,sha256=-NnyjyGtP6BzQ1XQQNsm66FqL9tt75EYCxNQfh30ZGc,1277
|
|
141
|
+
gllm_docproc/parser/image/__init__.pyi,sha256=MF6rUQ3_xMX5O1ZEE-S1zST3kp-r2eZVyxjO1oSWDy0,339
|
|
142
|
+
gllm_docproc/parser/image/image_mime_normalization_parser.pyi,sha256=crJTks127xM35qkN8f3xwWj-Ha6WYLpxuSHbTGSPpvU,2087
|
|
143
|
+
gllm_docproc/parser/image/image_plain_small_filter_parser.pyi,sha256=4nLq9q6Wl4bCCzu8R5or41uY1_GoUKc2g10GnHfgViQ,1936
|
|
144
|
+
gllm_docproc/parser/table/__init__.pyi,sha256=a2Zvp3tmJat1Kgp7ZWbikqzAviRVGyVLUcS9PGZ4owo,112
|
|
145
|
+
gllm_docproc/parser/table/table_caption_parser.pyi,sha256=bpp_vKDYRPyrveOBIiURYDGh5yBDU04PoenokLlRzRY,2770
|
|
146
|
+
gllm_docproc/request_handler/__init__.pyi,sha256=nulIcWHCBdrJHMbfJGIwfJmDnlcAI76FnRl-0xt5myw,112
|
|
147
|
+
gllm_docproc/request_handler/base_request_handler.pyi,sha256=CC495QmCo-SQR9jXFjJXDEIY_Q9G66RVqC-UNhTRwBE,461
|
|
148
|
+
gllm_docproc/response_handler/__init__.pyi,sha256=JcTvIdJ4heHLgOcB4i6KGKfOSUVhjkGNhUXmxPgelXI,116
|
|
149
|
+
gllm_docproc/response_handler/base_response_handler.pyi,sha256=0eYFF10duSJnuOaQamth2Y7YZMV1rIwouu87DxfaLxg,1287
|
|
150
|
+
gllm_docproc/utils/__init__.pyi,sha256=a8YWoY2dNzmWpAwL_WHSUgOBrtGv3GdmT3ets3H-tX4,100
|
|
151
|
+
gllm_docproc/utils/async_utils.pyi,sha256=KAj9dTiK3SRdEhN4itwPA75KSeFaHHulJX0hObaei3s,659
|
|
152
|
+
gllm_docproc/utils/file_utils.pyi,sha256=ALINcy33v_fJOrRBkN27ehYGQ-APm6dTsSyB9Zi4QXg,2684
|
|
153
|
+
gllm_docproc/utils/html_constants.pyi,sha256=UEQilpS3NdICdoFV1qVJXKNeL67bcdrZ15S3tb0tY2c,2948
|
|
154
|
+
gllm_docproc/validator/__init__.pyi,sha256=HviQKNWg9XDDw2-AnvIu4icYeT5pLYdxNNlfu63l2Js,505
|
|
155
|
+
gllm_docproc/validator/base_validator.pyi,sha256=9JBzNmrS6r9ExIHzF3kp7hPuAvL8Ua9vqrJ5hY7DQ8Q,1619
|
|
156
|
+
gllm_docproc/validator/character_count_validator.pyi,sha256=CGfVHxE3UJ2euT7kx65dqQ-hUHXRly3EMPoDWdRiOyQ,1507
|
|
157
|
+
gllm_docproc/validator/file_size_validator.pyi,sha256=ZvjH2itrF2x-JMU9uUZa4CgZSEO9l14fcN88JT5K8Jk,1216
|
|
158
|
+
gllm_docproc/validator/page_count_validator.pyi,sha256=CbxKHgJgexrRkAtNbV5yvxy__AV5wGuAB56jrdOtTpo,1307
|
|
159
|
+
gllm_docproc/validator/pipeline_validator.pyi,sha256=hOB15RBvIAemRytyfHVLQMvIVmYARmxcYZSs-gkHk60,1930
|
|
160
|
+
gllm_docproc/validator/model/__init__.pyi,sha256=-XgIRV1nvMPkcezSwP-10QOp95GCKheTfC58rY942yw,180
|
|
161
|
+
gllm_docproc/validator/model/validator_input.pyi,sha256=q_0FGTba88oa63lSo5_5Eojoj6gav3Lp_d3YxcUM7gc,1883
|
|
162
|
+
gllm_docproc/validator/model/validator_result.pyi,sha256=I2ZidQAVXjdKm2gXDIyunTYrgdEF-K2HxE7cIX7vUB0,734
|
|
163
|
+
gllm_docproc.build/.gitignore,sha256=aEiIwOuxfzdCmLZe4oB1JsBmCUxwG8x-u-HBCV9JT8E,1
|
|
164
|
+
gllm_docproc_binary-0.7.22.dist-info/METADATA,sha256=0uZeXs1QcWFbyIwZ7BgCmr62WSsTQeEtapFx0LKJkM4,6704
|
|
165
|
+
gllm_docproc_binary-0.7.22.dist-info/WHEEL,sha256=l2aKBREYfqJ7T2ljmr6hUiXPoNvvXF47bG4IHjuSyS4,96
|
|
166
|
+
gllm_docproc_binary-0.7.22.dist-info/top_level.txt,sha256=FzUqfBCCn6DsB0K9QO5mNXrR2VbqKu__KhFzHgHVz90,13
|
|
167
|
+
gllm_docproc_binary-0.7.22.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
gllm_docproc
|