nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Type, Any
|
|
9
|
+
|
|
10
|
+
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AccessLevelEnum(int, Enum):
    """
    Enum for representing different access levels.

    Members are ``int`` subclasses, so they compare and order like plain
    integers.

    Note
    ----
    This is for future use, and currently has no functional use case.

    Attributes
    ----------
    UNKNOWN : int
        Represents an unknown access level (value ``-1``).
    LEVEL_1 : int
        Represents access level 1.
    LEVEL_2 : int
        Represents access level 2.
    LEVEL_3 : int
        Represents access level 3.
    """

    UNKNOWN: int = -1
    LEVEL_1: int = 1
    LEVEL_2: int = 2
    LEVEL_3: int = 3
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ContentDescriptionEnum(str, Enum):
    """
    Enum for standard content descriptions extracted from different source types.

    Each member is a ``str`` whose value is a human-readable sentence
    describing the kind of content and the document type it came from.

    Attributes
    ----------
    DOCX_IMAGE : str
        Description for image extracted from DOCX document.
    DOCX_TABLE : str
        Description for structured table extracted from DOCX document.
    DOCX_TEXT : str
        Description for unstructured text from DOCX document.
    PDF_CHART : str
        Description for structured chart extracted from PDF document.
    PDF_IMAGE : str
        Description for image extracted from PDF document.
    PDF_INFOGRAPHIC : str
        Description for structured infographic extracted from PDF document.
    PDF_PAGE_IMAGE : str
        Description for a full-page image rendered from a PDF document.
    PDF_TABLE : str
        Description for structured table extracted from PDF document.
    PDF_TEXT : str
        Description for unstructured text from PDF document.
    PPTX_IMAGE : str
        Description for image extracted from PPTX presentation.
    PPTX_TABLE : str
        Description for structured table extracted from PPTX presentation.
    PPTX_TEXT : str
        Description for unstructured text from PPTX presentation.
    """

    DOCX_IMAGE: str = "Image extracted from DOCX document."
    DOCX_TABLE: str = "Structured table extracted from DOCX document."
    DOCX_TEXT: str = "Unstructured text from DOCX document."
    PDF_CHART: str = "Structured chart extracted from PDF document."
    PDF_IMAGE: str = "Image extracted from PDF document."
    PDF_INFOGRAPHIC: str = "Structured infographic extracted from PDF document."
    PDF_PAGE_IMAGE: str = "Full-page image rendered from a PDF document."
    PDF_TABLE: str = "Structured table extracted from PDF document."
    PDF_TEXT: str = "Unstructured text from PDF document."
    PPTX_IMAGE: str = "Image extracted from PPTX presentation."
    PPTX_TABLE: str = "Structured table extracted from PPTX presentation."
    PPTX_TEXT: str = "Unstructured text from PPTX presentation."
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class ContentTypeEnum(str, Enum):
    """
    Enum for representing various content types.

    Note: Content type declares the broad category of the content, such as text, image, audio, etc.
    This is not equivalent to the Document type, which is a specific file format.

    Attributes
    ----------
    AUDIO : str
        Represents audio content.
    CHART : str
        Represents chart content.
    EMBEDDING : str
        Represents embedding content.
    IMAGE : str
        Represents image content.
    INFOGRAPHIC : str
        Represents infographic content.
    INFO_MSG : str
        Represents an informational message.
    NONE : str
        The literal value ``"none"``; presumably marks content with no
        assigned type (confirm against callers).
    PAGE_IMAGE : str
        Represents a full-page image rendered from a document.
    STRUCTURED : str
        Represents structured content.
    TABLE : str
        Represents table content.
    TEXT : str
        Represents text content.
    UNKNOWN : str
        Represents an unknown content type.
    VIDEO : str
        Represents video content.
    """

    AUDIO: str = "audio"
    CHART: str = "chart"
    EMBEDDING: str = "embedding"
    IMAGE: str = "image"
    INFOGRAPHIC: str = "infographic"
    INFO_MSG: str = "info_message"
    NONE: str = "none"
    PAGE_IMAGE: str = "page_image"
    STRUCTURED: str = "structured"
    TABLE: str = "table"
    TEXT: str = "text"
    UNKNOWN: str = "unknown"
    VIDEO: str = "video"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class DocumentTypeEnum(str, Enum):
    """
    Enum for representing various document file types.

    Note: Document type refers to the specific file format of the content, such as PDF, DOCX, etc.
    This is not equivalent to the Content type, which is a broad category of the content.

    ``TXT`` and ``MD`` share the value ``"text"``. ``Enum`` treats a repeated
    value as an alias, so ``DocumentTypeEnum.MD is DocumentTypeEnum.TXT`` and
    iterating the enum yields ``TXT`` only.

    Attributes
    ----------
    BMP : str
        BMP image format.
    DOCX : str
        Microsoft Word document format.
    HTML : str
        HTML document.
    JPEG : str
        JPEG image format.
    PDF : str
        PDF document format.
    PNG : str
        PNG image format.
    PPTX : str
        PowerPoint presentation format.
    SVG : str
        SVG image format.
    TIFF : str
        TIFF image format.
    TXT : str
        Plain text file.
    MD : str
        Alias of ``TXT`` (same value ``"text"``).
    MP3 : str
        MP3 audio format.
    WAV : str
        WAV audio format.
    MP4 : str
        MP4 video format.
    MOV : str
        MOV video format.
    AVI : str
        AVI video format.
    MKV : str
        MKV video format.
    UNKNOWN : str
        Represents an unknown document type.
    """

    BMP: str = "bmp"
    DOCX: str = "docx"
    HTML: str = "html"
    JPEG: str = "jpeg"
    PDF: str = "pdf"
    PNG: str = "png"
    PPTX: str = "pptx"
    SVG: str = "svg"
    TIFF: str = "tiff"
    TXT: str = "text"
    # Same value as TXT, so this is an alias of TXT rather than a distinct member.
    MD: str = "text"
    MP3: str = "mp3"
    WAV: str = "wav"
    MP4: str = "mp4"
    MOV: str = "mov"
    AVI: str = "avi"
    MKV: str = "mkv"
    UNKNOWN: str = "unknown"
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class LanguageEnum(str, Enum):
    """
    Enum for representing various language codes.

    Attributes
    ----------
    AF : str
        Afrikaans language code.
    AR : str
        Arabic language code.
    BG : str
        Bulgarian language code.
    BN : str
        Bengali language code.
    CA : str
        Catalan language code.
    CS : str
        Czech language code.
    CY : str
        Welsh language code.
    DA : str
        Danish language code.
    DE : str
        German language code.
    EL : str
        Greek language code.
    EN : str
        English language code.
    ES : str
        Spanish language code.
    ET : str
        Estonian language code.
    FA : str
        Persian language code.
    FI : str
        Finnish language code.
    FR : str
        French language code.
    GU : str
        Gujarati language code.
    HE : str
        Hebrew language code.
    HI : str
        Hindi language code.
    HR : str
        Croatian language code.
    HU : str
        Hungarian language code.
    ID : str
        Indonesian language code.
    IT : str
        Italian language code.
    JA : str
        Japanese language code.
    KN : str
        Kannada language code.
    KO : str
        Korean language code.
    LT : str
        Lithuanian language code.
    LV : str
        Latvian language code.
    MK : str
        Macedonian language code.
    ML : str
        Malayalam language code.
    MR : str
        Marathi language code.
    NE : str
        Nepali language code.
    NL : str
        Dutch language code.
    NO : str
        Norwegian language code.
    PA : str
        Punjabi language code.
    PL : str
        Polish language code.
    PT : str
        Portuguese language code.
    RO : str
        Romanian language code.
    RU : str
        Russian language code.
    SK : str
        Slovak language code.
    SL : str
        Slovenian language code.
    SO : str
        Somali language code.
    SQ : str
        Albanian language code.
    SV : str
        Swedish language code.
    SW : str
        Swahili language code.
    TA : str
        Tamil language code.
    TE : str
        Telugu language code.
    TH : str
        Thai language code.
    TL : str
        Tagalog language code.
    TR : str
        Turkish language code.
    UK : str
        Ukrainian language code.
    UR : str
        Urdu language code.
    VI : str
        Vietnamese language code.
    ZH_CN : str
        Chinese (Simplified) language code.
    ZH_TW : str
        Chinese (Traditional) language code.
    UNKNOWN : str
        Represents an unknown language.
    """

    AF: str = "af"
    AR: str = "ar"
    BG: str = "bg"
    BN: str = "bn"
    CA: str = "ca"
    CS: str = "cs"
    CY: str = "cy"
    DA: str = "da"
    DE: str = "de"
    EL: str = "el"
    EN: str = "en"
    ES: str = "es"
    ET: str = "et"
    FA: str = "fa"
    FI: str = "fi"
    FR: str = "fr"
    GU: str = "gu"
    HE: str = "he"
    HI: str = "hi"
    HR: str = "hr"
    HU: str = "hu"
    ID: str = "id"
    IT: str = "it"
    JA: str = "ja"
    KN: str = "kn"
    KO: str = "ko"
    LT: str = "lt"
    LV: str = "lv"
    MK: str = "mk"
    ML: str = "ml"
    MR: str = "mr"
    NE: str = "ne"
    NL: str = "nl"
    NO: str = "no"
    PA: str = "pa"
    PL: str = "pl"
    PT: str = "pt"
    RO: str = "ro"
    RU: str = "ru"
    SK: str = "sk"
    SL: str = "sl"
    SO: str = "so"
    SQ: str = "sq"
    SV: str = "sv"
    SW: str = "sw"
    TA: str = "ta"
    TE: str = "te"
    TH: str = "th"
    TL: str = "tl"
    TR: str = "tr"
    UK: str = "uk"
    UR: str = "ur"
    VI: str = "vi"
    ZH_CN: str = "zh-cn"
    ZH_TW: str = "zh-tw"
    UNKNOWN: str = "unknown"

    @classmethod
    def has_value(cls: Type["LanguageEnum"], value: Any) -> bool:
        """
        Check if the enum contains the given value.

        The check is case-sensitive and goes through the public value lookup
        ``cls(value)``, so unhashable inputs such as lists or dicts yield
        ``False`` instead of raising ``TypeError``.

        Parameters
        ----------
        value : Any
            The value to check against the enum members.

        Returns
        -------
        bool
            True if the value exists in the enum, False otherwise.
        """
        try:
            cls(value)
        except ValueError:
            # Enum lookup raises ValueError for any value that matches no member.
            return False
        return True
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
class StatusEnum(str, Enum):
    """
    Enum for representing status messages.

    Members are ``str`` subclasses and compare equal to their lowercase
    string values.

    Attributes
    ----------
    ERROR : str
        Represents an error status.
    SUCCESS : str
        Represents a success status.
    """

    ERROR: str = "error"
    SUCCESS: str = "success"
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
class PipelinePhase(int, Enum):
    """
    The logical phase of a pipeline stage.

    Members are ``int`` subclasses numbered consecutively from 0 in
    declaration order, so they can be compared and sorted as integers.

    Attributes
    ----------
    PRE_PROCESSING : int
        Pre-processing phase (0).
    EXTRACTION : int
        Extraction phase (1).
    POST_PROCESSING : int
        Post-processing phase (2).
    MUTATION : int
        Mutation phase (3).
    TRANSFORM : int
        Transform phase (4).
    RESPONSE : int
        Response phase (5).
    TELEMETRY : int
        Telemetry phase (6).
    DRAIN : int
        Drain phase (7).
    """

    PRE_PROCESSING = 0
    EXTRACTION = 1
    POST_PROCESSING = 2
    MUTATION = 3
    TRANSFORM = 4
    RESPONSE = 5
    TELEMETRY = 6
    DRAIN = 7
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
class TableFormatEnum(str, Enum):
    """
    Enum for representing table formats.

    Members are ``str`` subclasses and compare equal to their lowercase
    string values.

    Attributes
    ----------
    HTML : str
        Represents HTML table format.
    IMAGE : str
        Represents image table format.
    LATEX : str
        Represents LaTeX table format.
    MARKDOWN : str
        Represents Markdown table format.
    PSEUDO_MARKDOWN : str
        Represents pseudo Markdown table format.
    SIMPLE : str
        Represents simple table format.
    """

    HTML: str = "html"
    IMAGE: str = "image"
    LATEX: str = "latex"
    MARKDOWN: str = "markdown"
    PSEUDO_MARKDOWN: str = "pseudo_markdown"
    SIMPLE: str = "simple"
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
class TaskTypeEnum(str, Enum):
    """
    Enum for representing various task types.

    Members are ``str`` subclasses and compare equal to their lowercase
    string values.

    Attributes
    ----------
    AUDIO_DATA_EXTRACT : str
        Represents a task for extracting audio data.
    CAPTION : str
        Represents a caption task.
    CHART_DATA_EXTRACT : str
        Represents a task for extracting chart data.
    DEDUP : str
        Represents a deduplication task.
    EMBED : str
        Represents an embedding task.
    EXTRACT : str
        Represents an extraction task.
    FILTER : str
        Represents a filtering task.
    INFOGRAPHIC_DATA_EXTRACT : str
        Represents a task for extracting infographic data.
    OCR_DATA_EXTRACT : str
        Represents a task for extracting OCR data.
    SPLIT : str
        Represents a splitting task.
    STORE_EMBEDDING : str
        Represents a task for storing embeddings.
    STORE : str
        Represents a storing task.
    TABLE_DATA_EXTRACT : str
        Represents a task for extracting table data.
    UDF : str
        Represents a user-defined function task.
    VDB_UPLOAD : str
        Represents a task for uploading to a vector database.
    """

    AUDIO_DATA_EXTRACT: str = "audio_data_extract"
    CAPTION: str = "caption"
    CHART_DATA_EXTRACT: str = "chart_data_extract"
    DEDUP: str = "dedup"
    EMBED: str = "embed"
    EXTRACT: str = "extract"
    FILTER: str = "filter"
    INFOGRAPHIC_DATA_EXTRACT: str = "infographic_data_extract"
    OCR_DATA_EXTRACT: str = "ocr_data_extract"
    SPLIT: str = "split"
    STORE_EMBEDDING: str = "store_embedding"
    STORE: str = "store"
    TABLE_DATA_EXTRACT: str = "table_data_extract"
    UDF: str = "udf"
    VDB_UPLOAD: str = "vdb_upload"
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
class TextTypeEnum(str, Enum):
    """
    Enum for representing different types of text segments.

    Members are ``str`` subclasses and compare equal to their lowercase
    string values.

    Attributes
    ----------
    BLOCK : str
        Represents a text block.
    BODY : str
        Represents body text.
    DOCUMENT : str
        Represents an entire document.
    HEADER : str
        Represents a header text.
    LINE : str
        Represents a single line of text.
    NEARBY_BLOCK : str
        Represents a block of text in close proximity to another.
    OTHER : str
        Represents other unspecified text type.
    PAGE : str
        Represents a page of text.
    SPAN : str
        Represents an inline text span.
    """

    BLOCK: str = "block"
    BODY: str = "body"
    DOCUMENT: str = "document"
    HEADER: str = "header"
    LINE: str = "line"
    NEARBY_BLOCK: str = "nearby_block"
    OTHER: str = "other"
    PAGE: str = "page"
    SPAN: str = "span"
|