gllm-docproc-binary 0.1.8__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gllm-docproc-binary might be problematic. Click here for more details.
- gllm_docproc/__init__.pyi +0 -0
- gllm_docproc/chunker/__init__.pyi +3 -0
- gllm_docproc/chunker/base_chunker.pyi +29 -0
- gllm_docproc/chunker/structured_element/__init__.pyi +3 -0
- gllm_docproc/chunker/structured_element/chunk_enricher.pyi +43 -0
- gllm_docproc/chunker/structured_element/structured_element_chunker.pyi +80 -0
- gllm_docproc/chunker/table/__init__.pyi +3 -0
- gllm_docproc/chunker/table/table_chunker.pyi +45 -0
- gllm_docproc/converter/__init__.pyi +3 -0
- gllm_docproc/converter/base_converter.pyi +16 -0
- gllm_docproc/data_generator/__init__.pyi +3 -0
- gllm_docproc/data_generator/base_data_generator.pyi +19 -0
- gllm_docproc/downloader/__init__.pyi +3 -0
- gllm_docproc/downloader/base_downloader.pyi +16 -0
- gllm_docproc/downloader/html/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/__init__.pyi +4 -0
- gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi +16 -0
- gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi +15 -0
- gllm_docproc/downloader/html/html_downloader.pyi +91 -0
- gllm_docproc/downloader/html/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/__init__.pyi +0 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi +9 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi +27 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi +29 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi +61 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi +50 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi +22 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi +57 -0
- gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi +51 -0
- gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi +45 -0
- gllm_docproc/downloader/html/utils/__init__.pyi +3 -0
- gllm_docproc/downloader/html/utils/web_utils.pyi +21 -0
- gllm_docproc/dpo_router/__init__.pyi +3 -0
- gllm_docproc/dpo_router/base_dpo_router.pyi +17 -0
- gllm_docproc/housekeeping/__init__.pyi +3 -0
- gllm_docproc/housekeeping/base_housekeeping.pyi +15 -0
- gllm_docproc/indexer/__init__.pyi +3 -0
- gllm_docproc/indexer/base_indexer.pyi +31 -0
- gllm_docproc/indexer/graph/__init__.pyi +3 -0
- gllm_docproc/indexer/knowledge_graph/__init__.pyi +4 -0
- gllm_docproc/loader/__init__.pyi +4 -0
- gllm_docproc/loader/audio/__init__.pyi +3 -0
- gllm_docproc/loader/base_loader.pyi +31 -0
- gllm_docproc/loader/docx/__init__.pyi +5 -0
- gllm_docproc/loader/docx/docx2python_loader.pyi +46 -0
- gllm_docproc/loader/docx/python_docx_loader.pyi +35 -0
- gllm_docproc/loader/docx/python_docx_table_loader.pyi +35 -0
- gllm_docproc/loader/exception/__init__.pyi +3 -0
- gllm_docproc/loader/exception/unsupported_file_extension_error.pyi +7 -0
- gllm_docproc/loader/html/__init__.pyi +5 -0
- gllm_docproc/loader/html/exception/__init__.pyi +3 -0
- gllm_docproc/loader/html/exception/html_load_exception.pyi +7 -0
- gllm_docproc/loader/html/flat/__init__.pyi +3 -0
- gllm_docproc/loader/html/flat/html_flat_base_handler.pyi +52 -0
- gllm_docproc/loader/html/flat/html_flat_loader.pyi +30 -0
- gllm_docproc/loader/html/flat/html_flat_merger.pyi +22 -0
- gllm_docproc/loader/html/html_base_loader.pyi +25 -0
- gllm_docproc/loader/html/nested/__init__.pyi +3 -0
- gllm_docproc/loader/html/nested/dictionary_utils.pyi +40 -0
- gllm_docproc/loader/html/nested/html_nested_base_handler.pyi +128 -0
- gllm_docproc/loader/html/nested/html_nested_element_handler.pyi +24 -0
- gllm_docproc/loader/html/nested/html_nested_loader.pyi +15 -0
- gllm_docproc/loader/html/utils/__init__.pyi +0 -0
- gllm_docproc/loader/html/utils/flat_table_utils.pyi +29 -0
- gllm_docproc/loader/html/utils/html_utils.pyi +41 -0
- gllm_docproc/loader/html/utils/removed_components.pyi +53 -0
- gllm_docproc/loader/html/utils/string_utils.pyi +33 -0
- gllm_docproc/loader/html/utils/table_utils.pyi +78 -0
- gllm_docproc/loader/json/__init__.pyi +3 -0
- gllm_docproc/loader/json/json_elements_loader.pyi +33 -0
- gllm_docproc/loader/loader_utils.pyi +42 -0
- gllm_docproc/loader/pdf/__init__.pyi +13 -0
- gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi +40 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi +45 -0
- gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi +50 -0
- gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_loader_utils.pyi +59 -0
- gllm_docproc/loader/pdf/pdf_miner_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi +33 -0
- gllm_docproc/loader/pdf/pdf_plumber_loader.pyi +38 -0
- gllm_docproc/loader/pdf/pymupdf_loader.pyi +43 -0
- gllm_docproc/loader/pdf/pymupdf_span_loader.pyi +44 -0
- gllm_docproc/loader/pdf/pymupdf_utils.pyi +34 -0
- gllm_docproc/loader/pdf/tabula_loader.pyi +32 -0
- gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi +37 -0
- gllm_docproc/loader/pipeline_loader.pyi +48 -0
- gllm_docproc/loader/txt/__init__.pyi +3 -0
- gllm_docproc/loader/txt/txt_loader.pyi +26 -0
- gllm_docproc/loader/xlsx/__init__.pyi +3 -0
- gllm_docproc/loader/xlsx/openpyxl_loader.pyi +37 -0
- gllm_docproc/model/__init__.pyi +4 -0
- gllm_docproc/model/element.pyi +37 -0
- gllm_docproc/model/element_metadata.pyi +35 -0
- gllm_docproc/parser/__init__.pyi +4 -0
- gllm_docproc/parser/base_parser.pyi +29 -0
- gllm_docproc/parser/document/__init__.pyi +6 -0
- gllm_docproc/parser/document/docx_parser.pyi +27 -0
- gllm_docproc/parser/document/pdf_parser.pyi +35 -0
- gllm_docproc/parser/document/txt_parser.pyi +22 -0
- gllm_docproc/parser/document/xlsx_parser.pyi +26 -0
- gllm_docproc/parser/html/__init__.pyi +4 -0
- gllm_docproc/parser/html/flat/__init__.pyi +0 -0
- gllm_docproc/parser/html/flat/html_flat_parser.pyi +27 -0
- gllm_docproc/parser/html/nested/__init__.pyi +0 -0
- gllm_docproc/parser/html/nested/html_json_processor.pyi +158 -0
- gllm_docproc/parser/html/nested/html_nested_parser.pyi +24 -0
- gllm_docproc/parser/html/nested/nested_element.pyi +31 -0
- gllm_docproc/parser/pipeline_parser.pyi +33 -0
- gllm_docproc/parser/table/__init__.pyi +3 -0
- gllm_docproc/parser/table/table_caption_parser.pyi +66 -0
- gllm_docproc/request_handler/__init__.pyi +3 -0
- gllm_docproc/request_handler/base_request_handler.pyi +17 -0
- gllm_docproc/response_handler/__init__.pyi +3 -0
- gllm_docproc/response_handler/base_response_handler.pyi +39 -0
- gllm_docproc/utils/__init__.pyi +0 -0
- gllm_docproc/utils/file_utils.pyi +76 -0
- gllm_docproc/utils/html_constants.pyi +121 -0
- gllm_docproc.build/.gitignore +1 -0
- gllm_docproc.cp312-win_amd64.pyd +0 -0
- gllm_docproc.pyi +149 -0
- gllm_docproc_binary-0.1.8.dist-info/METADATA +110 -0
- gllm_docproc_binary-0.1.8.dist-info/RECORD +123 -0
- gllm_docproc_binary-0.1.8.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
def create_folder(folder_path: str) -> None:
|
|
4
|
+
"""Create a folder.
|
|
5
|
+
|
|
6
|
+
This function checks if the folder path exists. If the folder path does not
|
|
7
|
+
exist, the function creates a folder in the specified folder path.
|
|
8
|
+
|
|
9
|
+
Args:
|
|
10
|
+
folder_path (str): The folder path to create.
|
|
11
|
+
"""
|
|
12
|
+
def create_full_path(dir_path: str, filename: str, file_extension: str) -> str:
|
|
13
|
+
"""Create a full path for a file.
|
|
14
|
+
|
|
15
|
+
This function creates a full path for a file by combining the directory
|
|
16
|
+
path, the filename, and the file extension.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
dir_path (str): The directory path.
|
|
20
|
+
filename (str): The filename.
|
|
21
|
+
file_extension (str): The file extension.
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
str: The full path for the file.
|
|
25
|
+
"""
|
|
26
|
+
def save_to_json(elements: list[dict[str, Any]] | dict[str, Any], folder_path: str, file_name: str) -> None:
|
|
27
|
+
"""Save a list of elements to a JSON file.
|
|
28
|
+
|
|
29
|
+
This function saves a list of elements to a JSON file. The function takes
|
|
30
|
+
the list of elements, the folder path, and the file name as input and saves
|
|
31
|
+
the elements to a JSON file in the specified folder.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
elements (list[dict[str, Any]] | dict[str, Any]): The list of elements, or a single dictionary, to save.
|
|
35
|
+
folder_path (str): The folder path to save the JSON file.
|
|
36
|
+
file_name (str): The file name of the JSON file.
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
None
|
|
40
|
+
"""
|
|
41
|
+
def save_to_csv(elements: list[dict[str, Any]], folder_path: str, file_name: str) -> None:
|
|
42
|
+
"""Save a list of elements to a CSV file.
|
|
43
|
+
|
|
44
|
+
This function saves a list of elements to a CSV file. The function takes
|
|
45
|
+
the list of elements, the folder path, and the file name as input and saves
|
|
46
|
+
the elements to a CSV file in the specified folder.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
elements (list[dict[str, Any]]): The list of elements to save.
|
|
50
|
+
folder_path (str): The folder path to save the CSV file.
|
|
51
|
+
file_name (str): The file name of the CSV file.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
None
|
|
55
|
+
"""
|
|
56
|
+
def save_file(content: str, filename: str):
|
|
57
|
+
"""Save the content to a file.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
content (str): The content to save.
|
|
61
|
+
filename (str): The filename to save the content to.
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
None
|
|
65
|
+
"""
|
|
66
|
+
def read_json_file(file_path: str) -> list[dict[str, Any]] | dict[str, Any]:
|
|
67
|
+
"""Read a JSON file.
|
|
68
|
+
|
|
69
|
+
This function reads a JSON file and returns the content of the JSON file.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
file_path (str): The path of the JSON file to read.
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
list[dict[str, Any]] | dict[str, Any]: The content of the JSON file.
|
|
76
|
+
"""
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from _typeshed import Incomplete
|
|
2
|
+
from gllm_docproc.model.element import AUDIO as AUDIO, FOOTER as FOOTER, HEADER as HEADER, HEADING as HEADING, IMAGE as IMAGE, TABLE as TABLE, TITLE as TITLE, VIDEO as VIDEO
|
|
3
|
+
|
|
4
|
+
FORMATTING_TAGS: Incomplete
|
|
5
|
+
SPACING: str
|
|
6
|
+
|
|
7
|
+
class MetaDataKeys:
|
|
8
|
+
"""Represents keys commonly used in metadata for web content."""
|
|
9
|
+
CHARSET: str
|
|
10
|
+
PROPERTY: str
|
|
11
|
+
CONTENT: str
|
|
12
|
+
NAME: str
|
|
13
|
+
HTTP_EQUIV: str
|
|
14
|
+
URL: str
|
|
15
|
+
TITLE: str
|
|
16
|
+
METADATA: str
|
|
17
|
+
SOURCE: str
|
|
18
|
+
SOURCE_TYPE: str
|
|
19
|
+
LOADED_DATETIME: str
|
|
20
|
+
|
|
21
|
+
class ContentDataKeys:
|
|
22
|
+
"""Represents keys commonly used in web content data."""
|
|
23
|
+
TAG: str
|
|
24
|
+
CONTENT: str
|
|
25
|
+
SOURCE: str
|
|
26
|
+
TYPE: str
|
|
27
|
+
SRC: str
|
|
28
|
+
PLACEHOLDER: str
|
|
29
|
+
TABLE: str
|
|
30
|
+
HREF: str
|
|
31
|
+
ALT: str
|
|
32
|
+
CLASS: str
|
|
33
|
+
VALUE: str
|
|
34
|
+
|
|
35
|
+
class ItemDataKeys:
|
|
36
|
+
"""Represents keys used for handling item data."""
|
|
37
|
+
ELEMENTS: str
|
|
38
|
+
TEXT: str
|
|
39
|
+
STRUCTURE: str
|
|
40
|
+
ELEMENT_ID: str
|
|
41
|
+
INDEX: str
|
|
42
|
+
LINK: str
|
|
43
|
+
FORMATS: str
|
|
44
|
+
COMBINE_PREV: str
|
|
45
|
+
LIST_TYPE: str
|
|
46
|
+
IS_LIST_FIRST_ITEM: str
|
|
47
|
+
METADATA: str
|
|
48
|
+
URL: str
|
|
49
|
+
GROUP_ID: str
|
|
50
|
+
PARENT_ID: str
|
|
51
|
+
LINE_BREAK: str
|
|
52
|
+
HTML_TAGS: str
|
|
53
|
+
ROW_ITEM: str
|
|
54
|
+
COLSPAN: str
|
|
55
|
+
ROWSPAN: str
|
|
56
|
+
|
|
57
|
+
class HTMLTags:
|
|
58
|
+
"""Represents commonly used HTML tags as constants."""
|
|
59
|
+
IMG: str
|
|
60
|
+
INPUT: str
|
|
61
|
+
SVG: str
|
|
62
|
+
SOURCE: str
|
|
63
|
+
TABLE: str
|
|
64
|
+
A: str
|
|
65
|
+
VIDEO: str
|
|
66
|
+
AUDIO: str
|
|
67
|
+
IFRAME: str
|
|
68
|
+
EMBED: str
|
|
69
|
+
TEXT: str
|
|
70
|
+
UL: str
|
|
71
|
+
OL: str
|
|
72
|
+
LI: str
|
|
73
|
+
P: str
|
|
74
|
+
BR: str
|
|
75
|
+
H: Incomplete
|
|
76
|
+
HEADER: str
|
|
77
|
+
TITLE: str
|
|
78
|
+
FOOTER: str
|
|
79
|
+
MEDIA_TAGS: Incomplete
|
|
80
|
+
TR: str
|
|
81
|
+
TD: str
|
|
82
|
+
TH: str
|
|
83
|
+
TBODY: str
|
|
84
|
+
TFOOT: str
|
|
85
|
+
THEAD: str
|
|
86
|
+
|
|
87
|
+
class ErrorMessage:
|
|
88
|
+
"""Represents predefined error messages used in the application."""
|
|
89
|
+
ERROR_FAILED_SAVE_JSON: str
|
|
90
|
+
ERROR_FAILED_SAVE_CSV: str
|
|
91
|
+
ERROR_FAILED_EXTRACT_DATA: str
|
|
92
|
+
ERROR_MISSING_KEY: str
|
|
93
|
+
ERROR_FAILED_TO_PROCESS_ITEM: str
|
|
94
|
+
ERROR_FAILED_TO_OPEN_SPIDER: str
|
|
95
|
+
ERROR_UNKNOWN_SOURCE: str
|
|
96
|
+
|
|
97
|
+
class Structure:
|
|
98
|
+
"""Represents the structure of the content."""
|
|
99
|
+
@classmethod
|
|
100
|
+
def get_structure(cls, tag: str):
|
|
101
|
+
"""Get the structure associated with the given HTML tag.
|
|
102
|
+
|
|
103
|
+
This class method maps HTML tags to their corresponding structure types and returns the
|
|
104
|
+
structure associated with the provided HTML tag.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
tag (str): The HTML tag for which to retrieve the structure.
|
|
108
|
+
|
|
109
|
+
Returns:
|
|
110
|
+
str or None: The structure associated with the HTML tag, or None if the tag is not mapped.
|
|
111
|
+
"""
|
|
112
|
+
|
|
113
|
+
class TableConstants:
|
|
114
|
+
"""Represents constants used for table extraction."""
|
|
115
|
+
TABLE_META_KEY: str
|
|
116
|
+
TABLE_CONTENT_KEY: str
|
|
117
|
+
TABLE_ROW_TYPE_KEY: str
|
|
118
|
+
MAX_CHAR_COUNT_PER_COLUMN: str
|
|
119
|
+
HEADER: str
|
|
120
|
+
BODY: str
|
|
121
|
+
FOOTER: str
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
*
|
|
Binary file
|
gllm_docproc.pyi
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# This file was generated by Nuitka
|
|
2
|
+
|
|
3
|
+
# Stubs included by default
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
__name__ = ...
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Modules used internally, to allow implicit dependencies to be seen:
|
|
11
|
+
import os
|
|
12
|
+
import abc
|
|
13
|
+
import typing
|
|
14
|
+
import hashlib
|
|
15
|
+
import inspect
|
|
16
|
+
import re
|
|
17
|
+
import langchain_text_splitters
|
|
18
|
+
import gllm_docproc.chunker.table.TableChunker
|
|
19
|
+
import pandas
|
|
20
|
+
import copy
|
|
21
|
+
import datetime
|
|
22
|
+
import scrapy
|
|
23
|
+
import gllm_docproc.downloader.html.exception.ItemScrapeFailedException
|
|
24
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.CrawlBaseSpider
|
|
25
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.CrawlSitemapLinkSpider
|
|
26
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.CrawlSitemapSpider
|
|
27
|
+
import gllm_docproc.downloader.html.utils.clean_url
|
|
28
|
+
import gllm_docproc.downloader.html.utils.is_valid_url
|
|
29
|
+
import scrapy.http
|
|
30
|
+
import scrapy_playwright
|
|
31
|
+
import scrapy_playwright.page
|
|
32
|
+
import urllib
|
|
33
|
+
import urllib.parse
|
|
34
|
+
import scrapy.crawler
|
|
35
|
+
import scrapy.spiders
|
|
36
|
+
import scrapy.spiders.sitemap
|
|
37
|
+
import scrapy.utils
|
|
38
|
+
import scrapy.utils.sitemap
|
|
39
|
+
import scrapy.linkextractors
|
|
40
|
+
import requests
|
|
41
|
+
import billiard
|
|
42
|
+
import gllm_docproc.downloader.html.exception.ZyteApiKeyNotProvidedException
|
|
43
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.ScrapeSpider
|
|
44
|
+
import gllm_docproc.downloader.html.scraper.scraper.spiders.ZyteScrapeSpider
|
|
45
|
+
import gllm_docproc.indexer.BaseIndexer
|
|
46
|
+
import uuid
|
|
47
|
+
import gllm_core
|
|
48
|
+
import gllm_core.utils
|
|
49
|
+
import gllm_core.utils.logger_manager
|
|
50
|
+
import gllm_datastore
|
|
51
|
+
import gllm_datastore.graph_data_store
|
|
52
|
+
import gllm_datastore.graph_data_store.llama_index_graph_rag_data_store
|
|
53
|
+
import gllm_datastore.graph_data_store.llama_index_neo4j_graph_rag_data_store
|
|
54
|
+
import llama_index
|
|
55
|
+
import llama_index.core
|
|
56
|
+
import llama_index.core.base
|
|
57
|
+
import llama_index.core.base.embeddings
|
|
58
|
+
import llama_index.core.base.embeddings.base
|
|
59
|
+
import llama_index.core.base.llms
|
|
60
|
+
import llama_index.core.base.llms.base
|
|
61
|
+
import llama_index.core.indices
|
|
62
|
+
import llama_index.core.indices.property_graph
|
|
63
|
+
import llama_index.core.indices.property_graph.transformations
|
|
64
|
+
import llama_index.core.schema
|
|
65
|
+
import llama_index.core.vector_stores
|
|
66
|
+
import llama_index.core.vector_stores.types
|
|
67
|
+
import gllm_core.utils.imports
|
|
68
|
+
import gllm_misc
|
|
69
|
+
import gllm_misc.knowledge_graph
|
|
70
|
+
import gllm_misc.knowledge_graph.graph_store
|
|
71
|
+
import asyncio
|
|
72
|
+
import gllm_misc.multimodal_manager
|
|
73
|
+
import gllm_misc.multimodal_manager.audio_to_text
|
|
74
|
+
import gllm_misc.multimodal_manager.audio_to_text.audio_to_text
|
|
75
|
+
import gllm_misc.multimodal_manager.schema
|
|
76
|
+
import base64
|
|
77
|
+
import docx2python
|
|
78
|
+
import docx2python.docx_output
|
|
79
|
+
import docx
|
|
80
|
+
import docx.table
|
|
81
|
+
import docx.text
|
|
82
|
+
import docx.text.paragraph
|
|
83
|
+
import gllm_docproc.loader.html.flat.HTMLFlatLoader
|
|
84
|
+
import gllm_docproc.loader.html.nested.HTMLNestedLoader
|
|
85
|
+
import parsel
|
|
86
|
+
import gllm_docproc.loader.html.exception.HtmlLoadException
|
|
87
|
+
import tabulate
|
|
88
|
+
import itertools
|
|
89
|
+
import __future__
|
|
90
|
+
import re.sub
|
|
91
|
+
import w3lib
|
|
92
|
+
import w3lib.html
|
|
93
|
+
import json
|
|
94
|
+
import gllm_docproc.loader.exception.UnsupportedFileExtensionError
|
|
95
|
+
import csv
|
|
96
|
+
import io
|
|
97
|
+
import zipfile
|
|
98
|
+
import adobe
|
|
99
|
+
import adobe.pdfservices
|
|
100
|
+
import adobe.pdfservices.operation
|
|
101
|
+
import adobe.pdfservices.operation.auth
|
|
102
|
+
import adobe.pdfservices.operation.auth.service_principal_credentials
|
|
103
|
+
import adobe.pdfservices.operation.io
|
|
104
|
+
import adobe.pdfservices.operation.io.cloud_asset
|
|
105
|
+
import adobe.pdfservices.operation.io.stream_asset
|
|
106
|
+
import adobe.pdfservices.operation.pdf_services
|
|
107
|
+
import adobe.pdfservices.operation.pdf_services_media_type
|
|
108
|
+
import adobe.pdfservices.operation.pdf_services_response
|
|
109
|
+
import adobe.pdfservices.operation.pdfjobs
|
|
110
|
+
import adobe.pdfservices.operation.pdfjobs.jobs
|
|
111
|
+
import adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job
|
|
112
|
+
import adobe.pdfservices.operation.pdfjobs.params
|
|
113
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf
|
|
114
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type
|
|
115
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params
|
|
116
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_renditions_element_type
|
|
117
|
+
import adobe.pdfservices.operation.pdfjobs.params.extract_pdf.table_structure_type
|
|
118
|
+
import adobe.pdfservices.operation.pdfjobs.result
|
|
119
|
+
import adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result
|
|
120
|
+
import azure
|
|
121
|
+
import azure.ai
|
|
122
|
+
import azure.ai.documentintelligence
|
|
123
|
+
import azure.ai.documentintelligence.models
|
|
124
|
+
import azure.core
|
|
125
|
+
import azure.core.credentials
|
|
126
|
+
import collections
|
|
127
|
+
import collections.Counter
|
|
128
|
+
import pdfminer
|
|
129
|
+
import pdfminer.high_level
|
|
130
|
+
import pdfminer.layout
|
|
131
|
+
import pdfplumber
|
|
132
|
+
import pdfplumber._typing
|
|
133
|
+
import pdfplumber.page
|
|
134
|
+
import pdfplumber.table
|
|
135
|
+
import fitz
|
|
136
|
+
import tabula
|
|
137
|
+
import tabula.io
|
|
138
|
+
import gllm_datastore.cache_data_store
|
|
139
|
+
import gllm_datastore.cache_data_store.cache_data_store
|
|
140
|
+
import gllm_datastore.cache_data_store.utils
|
|
141
|
+
import openpyxl
|
|
142
|
+
import openpyxl.cell
|
|
143
|
+
import openpyxl.cell.cell
|
|
144
|
+
import openpyxl.worksheet
|
|
145
|
+
import openpyxl.worksheet.worksheet
|
|
146
|
+
import pydantic
|
|
147
|
+
import math
|
|
148
|
+
import gllm_docproc.parser.BaseParser
|
|
149
|
+
import ntpath
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: gllm-docproc-binary
|
|
3
|
+
Version: 0.1.8
|
|
4
|
+
Summary: A library for orchestrating the processing of documents, typically in Gen AI applications (but not limited to just Gen AI).
|
|
5
|
+
Author: GenAI SDK Team
|
|
6
|
+
Author-email: gat-sdk@gdplabs.id
|
|
7
|
+
Requires-Python: >=3.11,<3.13
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Provides-Extra: audio
|
|
12
|
+
Provides-Extra: docx
|
|
13
|
+
Provides-Extra: html
|
|
14
|
+
Provides-Extra: kg
|
|
15
|
+
Provides-Extra: pdf
|
|
16
|
+
Provides-Extra: xlsx
|
|
17
|
+
Requires-Dist: azure-ai-documentintelligence (>=1.0.0b3,<2.0.0) ; extra == "pdf"
|
|
18
|
+
Requires-Dist: billiard (>=4.2.1,<5.0.0) ; extra == "html"
|
|
19
|
+
Requires-Dist: docx2python (==2.8.0) ; extra == "docx"
|
|
20
|
+
Requires-Dist: gllm-core-binary
|
|
21
|
+
Requires-Dist: gllm-datastore-binary
|
|
22
|
+
Requires-Dist: gllm-misc-binary[audio,kg]
|
|
23
|
+
Requires-Dist: jpype1 (>=1.5.0,<2.0.0) ; extra == "pdf"
|
|
24
|
+
Requires-Dist: langchain-text-splitters (>=0.3.2,<0.4.0)
|
|
25
|
+
Requires-Dist: libmagic (>=1.0,<2.0) ; sys_platform == "win32"
|
|
26
|
+
Requires-Dist: librosa (==0.10.1) ; extra == "audio"
|
|
27
|
+
Requires-Dist: llama-index-embeddings-openai (>=0.3.0,<0.4.0) ; extra == "kg"
|
|
28
|
+
Requires-Dist: llama-index-llms-openai (>=0.3.0,<0.4.0) ; extra == "kg"
|
|
29
|
+
Requires-Dist: openpyxl (>=3.0.10,<4.0.0) ; extra == "xlsx"
|
|
30
|
+
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
31
|
+
Requires-Dist: pdfminer-six (>=20231228,<20231229) ; extra == "pdf"
|
|
32
|
+
Requires-Dist: pdfplumber (>=0.11.4,<0.12.0) ; extra == "pdf"
|
|
33
|
+
Requires-Dist: pdfservices-sdk (>=4.0.0,<5.0.0) ; extra == "pdf"
|
|
34
|
+
Requires-Dist: playwright (>=1.40.0,<2.0.0) ; extra == "html"
|
|
35
|
+
Requires-Dist: pydantic (>=2.9.1,<3.0.0)
|
|
36
|
+
Requires-Dist: pymupdf (>=1.24.10,<2.0.0) ; extra == "pdf"
|
|
37
|
+
Requires-Dist: python-docx (==1.1.0) ; extra == "docx"
|
|
38
|
+
Requires-Dist: python-magic-bin (>=0.4.14,<0.5.0) ; sys_platform == "win32"
|
|
39
|
+
Requires-Dist: scrapy (>=2.11.0,<3.0.0) ; extra == "html"
|
|
40
|
+
Requires-Dist: scrapy-playwright (>=0.0.33,<0.1.0) ; extra == "html"
|
|
41
|
+
Requires-Dist: scrapy_zyte_api (>=0.12.2,<0.13.0) ; extra == "html"
|
|
42
|
+
Requires-Dist: tabula-py (>=2.9.3,<3.0.0) ; extra == "pdf"
|
|
43
|
+
Requires-Dist: tabulate (>=0.9.0,<0.10.0) ; extra == "pdf"
|
|
44
|
+
Requires-Dist: tqdm (==4.66.2) ; extra == "audio"
|
|
45
|
+
Requires-Dist: zyte-api (>=0.4.8,<0.5.0) ; extra == "html"
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
|
|
48
|
+
# GDP Labs Language Model Document Processing Orchestrator
|
|
49
|
+
|
|
50
|
+
## Description
|
|
51
|
+
|
|
52
|
+
A library for orchestrating the processing of documents, typically in Gen AI applications (but not limited to just Gen AI).
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
1. Python v3.11 or v3.12 (this package requires Python `>=3.11,<3.13`):
|
|
57
|
+
|
|
58
|
+
You can install Python using [Miniconda](https://docs.anaconda.com/free/miniconda/index.html).
|
|
59
|
+
|
|
60
|
+
2. Make sure you're in the `base` conda environment:
|
|
61
|
+
```bash
|
|
62
|
+
conda activate
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
3. [Poetry](https://python-poetry.org/docs/) v1.8.1 or above:
|
|
66
|
+
|
|
67
|
+
You can install Poetry using cURL (you need Python to install Poetry):
|
|
68
|
+
```bash
|
|
69
|
+
curl -sSL https://install.python-poetry.org | python3 -
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
4. Install the library using Poetry:
|
|
73
|
+
```bash
|
|
74
|
+
# Latest
|
|
75
|
+
poetry add "git+ssh://git@github.com/GDP-ADMIN/gen-ai-internal.git#subdirectory=libs/gllm-docproc"
|
|
76
|
+
|
|
77
|
+
# Specific version
|
|
78
|
+
poetry add "git+ssh://git@github.com/GDP-ADMIN/gen-ai-internal.git@gllm_docproc-v0.0.1-beta.1#subdirectory=libs/gllm-docproc"
|
|
79
|
+
|
|
80
|
+
# This PR
|
|
81
|
+
poetry add "git+ssh://git@github.com/GDP-ADMIN/gen-ai-internal.git@decision/separate-document-processing#subdirectory=libs/gllm-docproc"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
5. At this point, you can deactivate the Miniconda environment, as Poetry will create and manage its own virtual environment for you.
|
|
85
|
+
```bash
|
|
86
|
+
conda deactivate
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Managing Dependencies
|
|
90
|
+
1. Go to the root folder of the `gllm-docproc` module, e.g. `cd libs/gllm-docproc`.
|
|
91
|
+
2. Run `poetry shell` to create a virtual environment.
|
|
92
|
+
3. Run `poetry lock` to create a lock file if you haven't done it yet.
|
|
93
|
+
4. Run `poetry install` to install the `gllm-docproc` requirements for the first time.
|
|
94
|
+
5. Run `poetry update` if you change any dependency version in `pyproject.toml`.
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
## Contributing
|
|
98
|
+
Please refer to this [Python Style Guide](https://docs.google.com/document/d/1uRggCrHnVfDPBnG641FyQBwUwLoFw0kTzNqRm92vUwM/edit?usp=sharing)
|
|
99
|
+
to get information about the code style, documentation standards, and SCA that you need to use when contributing to this project.
|
|
100
|
+
|
|
101
|
+
1. Activate `pre-commit` hooks using `pre-commit install`
|
|
102
|
+
2. Run `poetry shell` to create a virtual environment.
|
|
103
|
+
3. Run `poetry lock` to create a lock file if you haven't done it yet.
|
|
104
|
+
4. Run `poetry install` to install the `gllm-docproc` requirements for the first time.
|
|
105
|
+
5. Run `which python` to get the path to use as the Visual Studio Code interpreter path (open the Command Palette with `Ctrl`+`Shift`+`P` or `Cmd`+`Shift`+`P`).
|
|
106
|
+
6. Try running the unit tests to see if everything is working:
|
|
107
|
+
```bash
|
|
108
|
+
poetry run pytest -s tests/unit_tests/
|
|
109
|
+
```
|
|
110
|
+
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
gllm_docproc/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
gllm_docproc/chunker/__init__.pyi,sha256=GOOIYg0-Fjd3g9uJDo9q8J0Gabwt_GHD_44axN6Y-qc,83
|
|
3
|
+
gllm_docproc/chunker/base_chunker.pyi,sha256=QPSYXqM9PdVFkQ46iPa3PsQ39N4fjK9luhBXfYHGYLU,1195
|
|
4
|
+
gllm_docproc/chunker/structured_element/__init__.pyi,sha256=0SzEj-OALKTVr4v4WgKJVZUnemuzMkhunZndFhPlR4w,136
|
|
5
|
+
gllm_docproc/chunker/structured_element/chunk_enricher.pyi,sha256=xJXf_rornekHmNiu-AiYmyT0iDw2E1AfDFf8qk-mZTw,1747
|
|
6
|
+
gllm_docproc/chunker/structured_element/structured_element_chunker.pyi,sha256=GGBbuWBhX58m2noi9LAl0OGWiZEG6eyzX5IL6t4cNhk,4143
|
|
7
|
+
gllm_docproc/chunker/table/__init__.pyi,sha256=HP844LD8YE2VZWqiYLFEOYIM3qIHggqHm5BCAn15xX4,162
|
|
8
|
+
gllm_docproc/chunker/table/table_chunker.pyi,sha256=EqqDL_l9olB81p3x_EG-bjNj2-RizHrLmOA5VVwQf0o,1898
|
|
9
|
+
gllm_docproc/converter/__init__.pyi,sha256=jqqxJRyzpYAPcH6HaFjeuVTGAoxgEvMSOYc0SR2iy6c,91
|
|
10
|
+
gllm_docproc/converter/base_converter.pyi,sha256=8tWnLsURnTbcMf6Wstr3skPo7oxS6NxavZVZk08zBeo,477
|
|
11
|
+
gllm_docproc/data_generator/__init__.pyi,sha256=9ApW8nMNwWv-VOOvH2lzWViK19p0_HjMbaW3OpZdpSo,135
|
|
12
|
+
gllm_docproc/data_generator/base_data_generator.pyi,sha256=ZfB_aIQn61kgf0tSnCyjgJ82SbznC8HUdFLEXfMJ7WE,825
|
|
13
|
+
gllm_docproc/downloader/__init__.pyi,sha256=1pilbSFFWxeOdspqPFJnfeLZx9Z-CBuS44uu-xo997I,95
|
|
14
|
+
gllm_docproc/downloader/base_downloader.pyi,sha256=RqzyDCfOPYmlWHEk6gO68WjrWn-wWk_E5sjfONUk8Oc,515
|
|
15
|
+
gllm_docproc/downloader/html/__init__.pyi,sha256=KE2vvVGYIZli01gJd69xFxA63A1Q6F76bJlwRUc6Nds,134
|
|
16
|
+
gllm_docproc/downloader/html/exception/__init__.pyi,sha256=Sx4tSQh97yLWL5dSd6ZtdUiNEpoMpupTVDvJWHVjA9g,290
|
|
17
|
+
gllm_docproc/downloader/html/exception/item_scrape_failed_exception.pyi,sha256=PB0Uurm2v3anDds1eV9IGnw8fzuTjAZNc_E2sl_LrWM,623
|
|
18
|
+
gllm_docproc/downloader/html/exception/zyte_api_key_not_provided_exception.pyi,sha256=UD1WzdOLnm4pWgu3BSVen9Hp41FzZ_9W_8eDF9FfDJQ,604
|
|
19
|
+
gllm_docproc/downloader/html/html_downloader.pyi,sha256=KJ2E6VMWQc6HEjhtMVAji6fzCc7C-JdwuLZOJ8id4fY,4470
|
|
20
|
+
gllm_docproc/downloader/html/scraper/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
+
gllm_docproc/downloader/html/scraper/scraper/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/__init__.pyi,sha256=2FI0gNdvv8PDRTjfEr_1xZ1PNhufmw2i_T1PhpZReOM,667
|
|
23
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_pdf_spider.pyi,sha256=CW5cPK-E3iT0n3TGrludEd_3WPQZ8Y0_15SYDB8ed_w,1392
|
|
24
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_link_spider.pyi,sha256=Q_vEX9hx6EgB-0k1CPyj4fFovllpp5JfPYGq_fbE4A4,1205
|
|
25
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_sitemap_spider.pyi,sha256=M1r5Q1J7Jf6GTYbDB-w1V3J9auAOBm5U5O1mVETdGB0,2606
|
|
26
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/crawl_spider.pyi,sha256=7blt3Vqz-cRpSc-25q2y0CmRbfvl43G2baCcbmYeQKY,2267
|
|
27
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/playwright_scrape_spider.pyi,sha256=3v5wxj_DPuQc0rLJ0rKNXk2o_yDgCAvVek0CtZVC5aw,972
|
|
28
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/scrape_spider.pyi,sha256=0-_fj_1xghGUOpGotaTS5ufzrYeF2a574QwDB7_ON2w,2162
|
|
29
|
+
gllm_docproc/downloader/html/scraper/scraper/spiders/zyte_scrape_spider.pyi,sha256=8QISd1lr1ICoZ7ZiaKGUMqLTkecE95Ww7c8W4ONTppk,2215
|
|
30
|
+
gllm_docproc/downloader/html/scraper/web_scraper_executor.pyi,sha256=9Q2_vfys5tzBZS8Qot48hkDUVbdnP19wLKiBmDM_hAc,2199
|
|
31
|
+
gllm_docproc/downloader/html/utils/__init__.pyi,sha256=rjlatKvMgkH6oHi6IkvKfHqZ9NiZEWeIBZtoR30gV9I,120
|
|
32
|
+
gllm_docproc/downloader/html/utils/web_utils.pyi,sha256=P3HSnFB4wucKNhBsJqW_wYMvhWoZ_Awivya5quPF_Z8,511
|
|
33
|
+
gllm_docproc/dpo_router/__init__.pyi,sha256=6rG_7XUQe-3a-gjoyWwmJQ5zMCkIqF7eL9kURy_kX6k,92
|
|
34
|
+
gllm_docproc/dpo_router/base_dpo_router.pyi,sha256=zBgD_U6yIuQaWSqI9B93AvFrskud0PX-CkdylSNq0YY,630
|
|
35
|
+
gllm_docproc/housekeeping/__init__.pyi,sha256=oL-C1roDf8io8zh90D4ugD_jAXdB_1CCcggZB1b5sTs,103
|
|
36
|
+
gllm_docproc/housekeeping/base_housekeeping.pyi,sha256=rptulu_Z3ufhKif8X-r3osNzHWJ7VuR1Z--sJQPXCXE,465
|
|
37
|
+
gllm_docproc/indexer/__init__.pyi,sha256=DAX3M09IFTLRzUmyd4XL0a6jhnZmE8UEPxZ-5dTlSt8,103
|
|
38
|
+
gllm_docproc/indexer/base_indexer.pyi,sha256=V5YhXNSbDixOe192JKMAD3lxEC0tyUhJCoPOrVJlRpo,1117
|
|
39
|
+
gllm_docproc/indexer/graph/__init__.pyi,sha256=pfuwDUwwVKP55TfX6jqQQPYETGUoMttKYcaznjH108A,142
|
|
40
|
+
gllm_docproc/indexer/knowledge_graph/__init__.pyi,sha256=7bSKY080wRwYlV59z1-ntIKBMQSVNmbtX3IfJBYu1b0,267
|
|
41
|
+
gllm_docproc/loader/__init__.pyi,sha256=Lzsi_ajlFYzu2teZ2kKiu7HHRHz7-M5ubpJ7XRmZHcg,160
|
|
42
|
+
gllm_docproc/loader/audio/__init__.pyi,sha256=bJMhxunQuKPnJzjXiIkJTtqdaefxb9f33Xw2QB2Wco0,83
|
|
43
|
+
gllm_docproc/loader/base_loader.pyi,sha256=eNorzdAQt-wMddX1S7I9glVSR7yyb56UldsimFuLnJI,1387
|
|
44
|
+
gllm_docproc/loader/docx/__init__.pyi,sha256=5JNIvnZyO9ZOMHJHSemAsFafwAf20iUxAiMDnR1U17I,308
|
|
45
|
+
gllm_docproc/loader/docx/docx2python_loader.pyi,sha256=z9EcFuqnXgvzqhnoSWft6oLgBHf63qyrvoYRLmxx0X8,2520
|
|
46
|
+
gllm_docproc/loader/docx/python_docx_loader.pyi,sha256=3ulT68shiwzK1dOopdgZKWcUTAuMcGhZIrJjwEhQC6M,2051
|
|
47
|
+
gllm_docproc/loader/docx/python_docx_table_loader.pyi,sha256=tQp_Fypis_pz_3VZddrpujV4abzX1xtoXOaDFJSe3R4,1756
|
|
48
|
+
gllm_docproc/loader/exception/__init__.pyi,sha256=G2CZHyjT_L0Y6h9XPjQ_MPVvg_P_mChBTQ-FtoW0cfs,157
|
|
49
|
+
gllm_docproc/loader/exception/unsupported_file_extension_error.pyi,sha256=c5xVjuQPNYZSuvlCHi87KMfrhBK-4R_i7-wr69QdUvQ,268
|
|
50
|
+
gllm_docproc/loader/html/__init__.pyi,sha256=hzLamxCb2AwLI8xO9ty0f-qY0kD97iAXCwUfBJyxkYw,244
|
|
51
|
+
gllm_docproc/loader/html/exception/__init__.pyi,sha256=4pNokkFZd_UWkgce5hvYUS6f5F_b1xZ_DShwmWakUDA,108
|
|
52
|
+
gllm_docproc/loader/html/exception/html_load_exception.pyi,sha256=R9Was3AEqcUXqqbARpgfQ5rnRS0pTL8gPXimqbWXFbo,261
|
|
53
|
+
gllm_docproc/loader/html/flat/__init__.pyi,sha256=GDjScQBkVstxxPnK4DIc3UE81shLXHjmJxgim7q1oj0,96
|
|
54
|
+
gllm_docproc/loader/html/flat/html_flat_base_handler.pyi,sha256=I0zPFG9Od4hvDu-GcN0VETDwgPwLfnT6b7cMsLdqCms,2706
|
|
55
|
+
gllm_docproc/loader/html/flat/html_flat_loader.pyi,sha256=i899Dop4zX2spr8K7VNwOtwax7CNqksjxGGnjFZ94hU,1839
|
|
56
|
+
gllm_docproc/loader/html/flat/html_flat_merger.pyi,sha256=mbgUwo90fZMiW20b-J_K89KgWCtmUL8G4KrUCzQt-5Q,1406
|
|
57
|
+
gllm_docproc/loader/html/html_base_loader.pyi,sha256=zPdRd0mMw90Q6pwJNzXWWEgea-7tYyHq0Pv1eSL3QZw,1185
|
|
58
|
+
gllm_docproc/loader/html/nested/__init__.pyi,sha256=nG7Z3zV4Z4KIG2MVWgaB7V6LA9LyN3JkgMkWdItA0dA,104
|
|
59
|
+
gllm_docproc/loader/html/nested/dictionary_utils.pyi,sha256=mcREqjBuSZ01idMXHu4Kfqa4aITdb4Mvlp8x7SZHX3U,1534
|
|
60
|
+
gllm_docproc/loader/html/nested/html_nested_base_handler.pyi,sha256=CDxr-Tq0qY5sM8fQujdxDr8YTnY2CJJj1PMeB28HzNQ,4862
|
|
61
|
+
gllm_docproc/loader/html/nested/html_nested_element_handler.pyi,sha256=d-vld6eNj_tjcLNRO2nX41Q_vdlZxoiq4kO2zrBuw2k,940
|
|
62
|
+
gllm_docproc/loader/html/nested/html_nested_loader.pyi,sha256=bOw4p5r4H7qYjY6F6ndDdlkGjjqz7Ze9A47oh19yhwo,932
|
|
63
|
+
gllm_docproc/loader/html/utils/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
64
|
+
gllm_docproc/loader/html/utils/flat_table_utils.pyi,sha256=0kKk6-iQOa__3-JWH92uaPhLJn9kjgKLAa5bFpoAOgE,1231
|
|
65
|
+
gllm_docproc/loader/html/utils/html_utils.pyi,sha256=wmNdZ7pxy0J1YgtUec0gRu4t0Gi3vc3hv_CIwi9KHig,1685
|
|
66
|
+
gllm_docproc/loader/html/utils/removed_components.pyi,sha256=wwpHUfu1HCMNIf8pvS52ibBKx3iAOXpq0TjkFPllGm8,2143
|
|
67
|
+
gllm_docproc/loader/html/utils/string_utils.pyi,sha256=HiKHATp4DGiORMI5WGadVKuNsqHMLVISIJi4MRXQhUs,1125
|
|
68
|
+
gllm_docproc/loader/html/utils/table_utils.pyi,sha256=Y0fVMcdBSwyTZj4NCVIiUF0z5yP9ssDsOvffS5WSSRA,2839
|
|
69
|
+
gllm_docproc/loader/json/__init__.pyi,sha256=UwKxlnJQCMBevsVLtEdNqbUwTjSUDAiGVQ1RPCiXQYc,112
|
|
70
|
+
gllm_docproc/loader/json/json_elements_loader.pyi,sha256=jYR0bZoVTI678Syx8dviN5EwacQ5gXl3Af0wqtwol64,1488
|
|
71
|
+
gllm_docproc/loader/loader_utils.pyi,sha256=Ek1B89xX5qigd_qI8yqX8dDTKC-4uCrEeJ2AyPc6su8,1727
|
|
72
|
+
gllm_docproc/loader/pdf/__init__.pyi,sha256=TCxrhdrRBUdrN8vsO-gGgQx5I97KEBEwNRk2LgeU634,1212
|
|
73
|
+
gllm_docproc/loader/pdf/adobe_pdf_extract_loader.pyi,sha256=YiKySUyKrtWVe_LmyRKmr9WHT_Z_ZakY8sojpplg5ks,1999
|
|
74
|
+
gllm_docproc/loader/pdf/azure_ai_document_intelligence_loader.pyi,sha256=spI9Xx3Bd9pNlQHUnUb19WivKKQ538RJGqI9ukYA3tM,2508
|
|
75
|
+
gllm_docproc/loader/pdf/azure_ai_document_intelligence_raw_loader.pyi,sha256=Qh9VCxO1VCFllmgqM5lnG8BO_M3NVM9Bfdy3TqnykpQ,2758
|
|
76
|
+
gllm_docproc/loader/pdf/glair_vision_ocr_loader.pyi,sha256=WDY5FX7OcINZKU_IZJY7qHrmzW-YQiOLMjWmyZqS6P8,1698
|
|
77
|
+
gllm_docproc/loader/pdf/pdf_loader_utils.pyi,sha256=ztwgYvjM80ESRZ5obioIidGwfm-TPfcC0Qd5nPCdH1Y,2997
|
|
78
|
+
gllm_docproc/loader/pdf/pdf_miner_loader.pyi,sha256=Ssk6kqgzucDpK_ohASfXvg5bK53dm8ktSmhClxjDsAU,2254
|
|
79
|
+
gllm_docproc/loader/pdf/pdf_miner_word_loader.pyi,sha256=vpikwwzR6j26VO3UdY_UqcUavM-TozgxSwSSYTl50HY,1551
|
|
80
|
+
gllm_docproc/loader/pdf/pdf_plumber_loader.pyi,sha256=EJlRHi2DvGDhdXRvpbH5ACsJTp4ocJhb8zjtxHq8kMU,2186
|
|
81
|
+
gllm_docproc/loader/pdf/pymupdf_loader.pyi,sha256=Hv4yTqTJzhBZIMtCinZfuUzaVjUsHK_3g5W_jQlNu7g,2786
|
|
82
|
+
gllm_docproc/loader/pdf/pymupdf_span_loader.pyi,sha256=CWPL9PhLeqFZ8Vf_r2q7fzUTWu2rD3x4optX7SMppiI,2806
|
|
83
|
+
gllm_docproc/loader/pdf/pymupdf_utils.pyi,sha256=HAfcj0jijuS5DAu0GVOU2HCuP8cbXuELtAnPNJpB3IE,1712
|
|
84
|
+
gllm_docproc/loader/pdf/tabula_loader.pyi,sha256=zNhW3KVr3YQLyMJQamHXd2B_QQ925r1IqprHiXM42Z8,1788
|
|
85
|
+
gllm_docproc/loader/pdf/text_inject_pdf_plumber_loader.pyi,sha256=xRWomeYCvzMca1YeSNQYD8ArDpHDQf9Wz9gn3dTAfX0,2045
|
|
86
|
+
gllm_docproc/loader/pipeline_loader.pyi,sha256=QW7SHv-0hzF1wwXbSD3anrbJFkd601KhTzz7KA8XVVM,1852
|
|
87
|
+
gllm_docproc/loader/txt/__init__.pyi,sha256=fwhz7Y79UKInwtd_4tnq3sXFH4YdsfE-zjykWnZKmLI,75
|
|
88
|
+
gllm_docproc/loader/txt/txt_loader.pyi,sha256=rZCcwF7wUVsS2PnwwBFyTF8juLSoaO83MQRl_cOfvdQ,1198
|
|
89
|
+
gllm_docproc/loader/xlsx/__init__.pyi,sha256=bdF9g2QKvO0E_aTBdfbwXKivHTAohdRHBQYMou8Yr2s,95
|
|
90
|
+
gllm_docproc/loader/xlsx/openpyxl_loader.pyi,sha256=ebxyC1yyNyF4BHnbGScXZS3ORwOEzjXaXL17HvdHoK4,2019
|
|
91
|
+
gllm_docproc/model/__init__.pyi,sha256=gXhu83AiLNwoFpv_XoJdUpPTl3BVo30wV-mWkGPA1mM,151
|
|
92
|
+
gllm_docproc/model/element.pyi,sha256=w9VQdeR0JzRvtDOdIzQkzFEVRyeGXj_prvAnRYFkIEw,1085
|
|
93
|
+
gllm_docproc/model/element_metadata.pyi,sha256=p9WqqfPfRpLZaZIkkAfRVzRvt-8OhaT-gZgCc_tLXZQ,846
|
|
94
|
+
gllm_docproc/parser/__init__.pyi,sha256=7ylnmzWFyW4_XTVkxyj9iaIfRfBQu6_d5c-Jj781nlY,160
|
|
95
|
+
gllm_docproc/parser/base_parser.pyi,sha256=v_k-NpEXM4D7m3nVH7v16oTzGRV368Df8C-Tnbuqfso,1198
|
|
96
|
+
gllm_docproc/parser/document/__init__.pyi,sha256=J-_Vko_oV4tBjDziORx7a70Wxp0Kk29rz2JbkXLr96E,266
|
|
97
|
+
gllm_docproc/parser/document/docx_parser.pyi,sha256=XeMY-pNyEUoUg4xqtc7XMmGGImumWMmzC9C9H4iMTB4,1546
|
|
98
|
+
gllm_docproc/parser/document/pdf_parser.pyi,sha256=ZJvY4TkVsXAZw_NM6vBjqPgYmk__gI3Nkox4-oQCStQ,1546
|
|
99
|
+
gllm_docproc/parser/document/txt_parser.pyi,sha256=sdERCHZ_lR6B7Q0oTDKp0oQm55mPVH1M-04fuZWpzkI,885
|
|
100
|
+
gllm_docproc/parser/document/xlsx_parser.pyi,sha256=yylVDHfUm5i_jD0ECjo2KamVxGjsrV2_t86TTsNDgdU,1094
|
|
101
|
+
gllm_docproc/parser/html/__init__.pyi,sha256=DAG6lL1SfvN7euCP0XSq2Bqmai931jx9_Oj8u4LGN7I,198
|
|
102
|
+
gllm_docproc/parser/html/flat/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
103
|
+
gllm_docproc/parser/html/flat/html_flat_parser.pyi,sha256=CwQZbYSn4QpeQ_uzGUcbFTl9jDy8BtYiZzJuK5vUygM,1258
|
|
104
|
+
gllm_docproc/parser/html/nested/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
105
|
+
gllm_docproc/parser/html/nested/html_json_processor.pyi,sha256=BSOhYl_Sp3eNn5gR62LYpRjNIM52Sp9zWed5Ukyyweo,6498
|
|
106
|
+
gllm_docproc/parser/html/nested/html_nested_parser.pyi,sha256=O3JhTWHIXWKNCl4kfk4lWH1SQRiv3bRbu_OMDSHicFA,1014
|
|
107
|
+
gllm_docproc/parser/html/nested/nested_element.pyi,sha256=-NnyjyGtP6BzQ1XQQNsm66FqL9tt75EYCxNQfh30ZGc,1277
|
|
108
|
+
gllm_docproc/parser/pipeline_parser.pyi,sha256=TBVbZQEdU9T-oNE4r9BE0VW1wu_sjy6VPPNVvwMdG9U,1270
|
|
109
|
+
gllm_docproc/parser/table/__init__.pyi,sha256=a2Zvp3tmJat1Kgp7ZWbikqzAviRVGyVLUcS9PGZ4owo,112
|
|
110
|
+
gllm_docproc/parser/table/table_caption_parser.pyi,sha256=bpp_vKDYRPyrveOBIiURYDGh5yBDU04PoenokLlRzRY,2770
|
|
111
|
+
gllm_docproc/request_handler/__init__.pyi,sha256=nulIcWHCBdrJHMbfJGIwfJmDnlcAI76FnRl-0xt5myw,112
|
|
112
|
+
gllm_docproc/request_handler/base_request_handler.pyi,sha256=uoVfkLrbu0X1nGd1tb-4jQctbnLQvKefidgXQdvOBG4,496
|
|
113
|
+
gllm_docproc/response_handler/__init__.pyi,sha256=JcTvIdJ4heHLgOcB4i6KGKfOSUVhjkGNhUXmxPgelXI,116
|
|
114
|
+
gllm_docproc/response_handler/base_response_handler.pyi,sha256=9dd-hLidcH427r4ArLBU2_Q27OObnp-kaqGud2O8ROY,1322
|
|
115
|
+
gllm_docproc/utils/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
116
|
+
gllm_docproc/utils/file_utils.pyi,sha256=Jra4dWe3OMRLX3B_zQz7JuCZfA7Tm0PJfuxJTLXHhko,2641
|
|
117
|
+
gllm_docproc/utils/html_constants.pyi,sha256=-4Gaw3aGFz33-8jDqT8pMw0jSaiKOhjjRHzEVVswsx8,2920
|
|
118
|
+
gllm_docproc.build/.gitignore,sha256=aEiIwOuxfzdCmLZe4oB1JsBmCUxwG8x-u-HBCV9JT8E,1
|
|
119
|
+
gllm_docproc.cp312-win_amd64.pyd,sha256=rh64eo08UeuTsj8ZFGTc-xQLzyRD5rO--eFhfVbBIog,3016192
|
|
120
|
+
gllm_docproc.pyi,sha256=7HEXCdRwlq3zEY8FpmlpxffNOwehQiODxU7R1cocs2U,4952
|
|
121
|
+
gllm_docproc_binary-0.1.8.dist-info/METADATA,sha256=NdGZBZM1pBiIpw2FqAMRz5NHfTfmlOe1bSegEggsPZU,4725
|
|
122
|
+
gllm_docproc_binary-0.1.8.dist-info/WHEEL,sha256=4N0hGcnWMI_Ty6ATf4qJqqSl-UNI-Ln828iTWGIywmU,98
|
|
123
|
+
gllm_docproc_binary-0.1.8.dist-info/RECORD,,
|