nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,550 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from enum import Enum
8
+ from typing import Type, Any
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class AccessLevelEnum(int, Enum):
    """
    Integer-valued access levels.

    Note
    ----
    This is for future use, and currently has no functional use case.

    Members are intentionally left without type annotations: the typing
    specification treats an annotated assignment inside an enum body as an
    error, and type checkers infer the member type from the value.

    Attributes
    ----------
    UNKNOWN
        Sentinel with value ``-1``; presumably means the access level has not
        been determined (confirm with callers).
    LEVEL_1
        Access level 1.
    LEVEL_2
        Access level 2.
    LEVEL_3
        Access level 3.
    """

    UNKNOWN = -1
    LEVEL_1 = 1
    LEVEL_2 = 2
    LEVEL_3 = 3
35
+
36
+
37
class ContentDescriptionEnum(str, Enum):
    """
    Standard human-readable descriptions for content extracted from each source type.

    Every member's value is the description sentence itself. Members are
    intentionally left without type annotations, as the typing specification
    requires for enum members.

    Attributes
    ----------
    DOCX_IMAGE
        Image extracted from a DOCX document.
    DOCX_TABLE
        Structured table extracted from a DOCX document.
    DOCX_TEXT
        Unstructured text from a DOCX document.
    PDF_CHART
        Structured chart extracted from a PDF document.
    PDF_IMAGE
        Image extracted from a PDF document.
    PDF_INFOGRAPHIC
        Structured infographic extracted from a PDF document.
    PDF_PAGE_IMAGE
        Full-page image rendered from a PDF document.
    PDF_TABLE
        Structured table extracted from a PDF document.
    PDF_TEXT
        Unstructured text from a PDF document.
    PPTX_IMAGE
        Image extracted from a PPTX presentation.
    PPTX_TABLE
        Structured table extracted from a PPTX presentation.
    PPTX_TEXT
        Unstructured text from a PPTX presentation.
    """

    DOCX_IMAGE = "Image extracted from DOCX document."
    DOCX_TABLE = "Structured table extracted from DOCX document."
    DOCX_TEXT = "Unstructured text from DOCX document."
    PDF_CHART = "Structured chart extracted from PDF document."
    PDF_IMAGE = "Image extracted from PDF document."
    PDF_INFOGRAPHIC = "Structured infographic extracted from PDF document."
    PDF_PAGE_IMAGE = "Full-page image rendered from a PDF document."
    PDF_TABLE = "Structured table extracted from PDF document."
    PDF_TEXT = "Unstructured text from PDF document."
    PPTX_IMAGE = "Image extracted from PPTX presentation."
    PPTX_TABLE = "Structured table extracted from PPTX presentation."
    PPTX_TEXT = "Unstructured text from PPTX presentation."
81
+
82
+
83
class ContentTypeEnum(str, Enum):
    """
    Broad content categories.

    Note: Content type declares the broad category of the content, such as text, image, audio, etc.
    This is not equivalent to the Document type, which is a specific file format.

    Members are intentionally left without type annotations, as the typing
    specification requires for enum members.

    Attributes
    ----------
    AUDIO
        Audio content (``"audio"``).
    CHART
        Chart content (``"chart"``).
    EMBEDDING
        Embedding content (``"embedding"``).
    IMAGE
        Image content (``"image"``).
    INFOGRAPHIC
        Infographic content (``"infographic"``).
    INFO_MSG
        An informational message (``"info_message"``).
    NONE
        Serialized as ``"none"``; presumably marks the absence of a content
        type (confirm with callers).
    PAGE_IMAGE
        A full-page image rendered from a document (``"page_image"``).
    STRUCTURED
        Structured content (``"structured"``).
    TABLE
        Table content (``"table"``).
    TEXT
        Text content (``"text"``).
    UNKNOWN
        Serialized as ``"unknown"``; presumably used when the content type
        cannot be determined (confirm with callers).
    VIDEO
        Video content (``"video"``).
    """

    AUDIO = "audio"
    CHART = "chart"
    EMBEDDING = "embedding"
    IMAGE = "image"
    INFOGRAPHIC = "infographic"
    INFO_MSG = "info_message"
    NONE = "none"
    PAGE_IMAGE = "page_image"
    STRUCTURED = "structured"
    TABLE = "table"
    TEXT = "text"
    UNKNOWN = "unknown"
    VIDEO = "video"
125
+
126
+
127
class DocumentTypeEnum(str, Enum):
    """
    Document file types.

    Note: Document type refers to the specific file format of the content, such as PDF, DOCX, etc.
    This is not equivalent to the Content type, which is a broad category of the content.

    ``TXT`` and ``MD`` share the value ``"text"``. Because enum members with
    equal values are aliases, ``DocumentTypeEnum.MD is DocumentTypeEnum.TXT``:
    iterating the enum yields ``TXT`` only, and ``DocumentTypeEnum("text")``
    resolves to ``TXT``. ``MD`` stays reachable by name and via ``__members__``.

    Members are intentionally left without type annotations, as the typing
    specification requires for enum members.

    Attributes
    ----------
    BMP
        BMP image format.
    DOCX
        Microsoft Word document format.
    HTML
        HTML document.
    JPEG
        JPEG image format.
    PDF
        PDF document format.
    PNG
        PNG image format.
    PPTX
        PowerPoint presentation format.
    SVG
        SVG image format.
    TIFF
        TIFF image format.
    TXT
        Plain text file; note the value is ``"text"``, not ``"txt"``.
    MD
        Alias of ``TXT`` (same value ``"text"``); presumably Markdown files are
        handled as plain text (confirm with callers).
    MP3
        MP3 audio format.
    WAV
        WAV audio format.
    MP4
        MP4 video format.
    MOV
        MOV video format.
    AVI
        AVI video format.
    MKV
        MKV video format.
    UNKNOWN
        Serialized as ``"unknown"``; presumably used when the file type cannot
        be determined (confirm with callers).
    """

    BMP = "bmp"
    DOCX = "docx"
    HTML = "html"
    JPEG = "jpeg"
    PDF = "pdf"
    PNG = "png"
    PPTX = "pptx"
    SVG = "svg"
    TIFF = "tiff"
    TXT = "text"
    MD = "text"  # same value as TXT, so this is an alias rather than a distinct member
    MP3 = "mp3"
    WAV = "wav"
    MP4 = "mp4"
    MOV = "mov"
    AVI = "avi"
    MKV = "mkv"
    UNKNOWN = "unknown"
188
+
189
+
190
class LanguageEnum(str, Enum):
    """
    Language codes.

    Each member's value is the lower-case code shown in its assignment
    (e.g. ``EN`` -> ``"en"``, ``ZH_CN`` -> ``"zh-cn"``). Members are
    intentionally left without type annotations, as the typing specification
    requires for enum members.

    Attributes
    ----------
    AF, AR, BG, BN, CA, CS, CY, DA
        Afrikaans, Arabic, Bulgarian, Bengali, Catalan, Czech, Welsh, Danish.
    DE, EL, EN, ES, ET, FA, FI, FR
        German, Greek, English, Spanish, Estonian, Persian, Finnish, French.
    GU, HE, HI, HR, HU, ID, IT, JA
        Gujarati, Hebrew, Hindi, Croatian, Hungarian, Indonesian, Italian, Japanese.
    KN, KO, LT, LV, MK, ML, MR, NE
        Kannada, Korean, Lithuanian, Latvian, Macedonian, Malayalam, Marathi, Nepali.
    NL, NO, PA, PL, PT, RO, RU, SK
        Dutch, Norwegian, Punjabi, Polish, Portuguese, Romanian, Russian, Slovak.
    SL, SO, SQ, SV, SW, TA, TE, TH
        Slovenian, Somali, Albanian, Swedish, Swahili, Tamil, Telugu, Thai.
    TL, TR, UK, UR, VI
        Tagalog, Turkish, Ukrainian, Urdu, Vietnamese.
    ZH_CN, ZH_TW
        Chinese (Simplified) and Chinese (Traditional).
    UNKNOWN
        Represents an unknown language.
    """

    AF = "af"
    AR = "ar"
    BG = "bg"
    BN = "bn"
    CA = "ca"
    CS = "cs"
    CY = "cy"
    DA = "da"
    DE = "de"
    EL = "el"
    EN = "en"
    ES = "es"
    ET = "et"
    FA = "fa"
    FI = "fi"
    FR = "fr"
    GU = "gu"
    HE = "he"
    HI = "hi"
    HR = "hr"
    HU = "hu"
    ID = "id"
    IT = "it"
    JA = "ja"
    KN = "kn"
    KO = "ko"
    LT = "lt"
    LV = "lv"
    MK = "mk"
    ML = "ml"
    MR = "mr"
    NE = "ne"
    NL = "nl"
    NO = "no"
    PA = "pa"
    PL = "pl"
    PT = "pt"
    RO = "ro"
    RU = "ru"
    SK = "sk"
    SL = "sl"
    SO = "so"
    SQ = "sq"
    SV = "sv"
    SW = "sw"
    TA = "ta"
    TE = "te"
    TH = "th"
    TL = "tl"
    TR = "tr"
    UK = "uk"
    UR = "ur"
    VI = "vi"
    ZH_CN = "zh-cn"
    ZH_TW = "zh-tw"
    UNKNOWN = "unknown"

    @classmethod
    def has_value(cls: Type["LanguageEnum"], value: Any) -> bool:
        """
        Check if the enum contains the given value.

        Parameters
        ----------
        value : Any
            The value to check against the enum members. Any object is
            accepted, including unhashable ones such as lists or dicts.

        Returns
        -------
        bool
            True if the value exists in the enum, False otherwise.
        """
        try:
            return value in cls._value2member_map_
        except TypeError:
            # The lookup hashes `value`; an unhashable object can never equal
            # one of the (string) member values, so report "not present"
            # instead of letting the TypeError escape.
            return False
383
+
384
+
385
class StatusEnum(str, Enum):
    """
    Status values.

    Members are intentionally left without type annotations, as the typing
    specification requires for enum members.

    Attributes
    ----------
    ERROR
        An error status (``"error"``).
    SUCCESS
        A success status (``"success"``).
    """

    ERROR = "error"
    SUCCESS = "success"
399
+
400
+
401
class PipelinePhase(int, Enum):
    """
    Logical phase that a pipeline stage belongs to.

    Values are consecutive integers starting at 0, in the order listed below.

    Attributes
    ----------
    PRE_PROCESSING
        Pre-processing phase (0).
    EXTRACTION
        Extraction phase (1).
    POST_PROCESSING
        Post-processing phase (2).
    MUTATION
        Mutation phase (3).
    TRANSFORM
        Transform phase (4).
    RESPONSE
        Response phase (5).
    TELEMETRY
        Telemetry phase (6).
    DRAIN
        Drain phase (7).
    """

    PRE_PROCESSING = 0
    EXTRACTION = 1
    POST_PROCESSING = 2
    MUTATION = 3
    TRANSFORM = 4
    RESPONSE = 5
    TELEMETRY = 6
    DRAIN = 7
433
+
434
+
435
class TableFormatEnum(str, Enum):
    """
    Table formats.

    Members are intentionally left without type annotations, as the typing
    specification requires for enum members.

    Attributes
    ----------
    HTML
        HTML table format (``"html"``).
    IMAGE
        Image table format (``"image"``).
    LATEX
        LaTeX table format (``"latex"``).
    MARKDOWN
        Markdown table format (``"markdown"``).
    PSEUDO_MARKDOWN
        Pseudo Markdown table format (``"pseudo_markdown"``).
    SIMPLE
        Simple table format (``"simple"``).
    """

    HTML = "html"
    IMAGE = "image"
    LATEX = "latex"
    MARKDOWN = "markdown"
    PSEUDO_MARKDOWN = "pseudo_markdown"
    SIMPLE = "simple"
461
+
462
+
463
class TaskTypeEnum(str, Enum):
    """
    Task types.

    Each member's value is its name in lower case. Members are intentionally
    left without type annotations, as the typing specification requires for
    enum members.

    Attributes
    ----------
    AUDIO_DATA_EXTRACT
        Task for extracting audio data.
    CAPTION
        Caption task.
    CHART_DATA_EXTRACT
        Task for extracting chart data.
    DEDUP
        Deduplication task.
    EMBED
        Embedding task.
    EXTRACT
        Extraction task.
    FILTER
        Filtering task.
    INFOGRAPHIC_DATA_EXTRACT
        Task for extracting infographic data.
    OCR_DATA_EXTRACT
        Serialized as ``"ocr_data_extract"``; presumably a task for extracting
        data via OCR (confirm with callers).
    SPLIT
        Splitting task.
    STORE_EMBEDDING
        Task for storing embeddings.
    STORE
        Storing task.
    TABLE_DATA_EXTRACT
        Task for extracting table data.
    UDF
        User-defined function task.
    VDB_UPLOAD
        Task for uploading to a vector database.
    """

    AUDIO_DATA_EXTRACT = "audio_data_extract"
    CAPTION = "caption"
    CHART_DATA_EXTRACT = "chart_data_extract"
    DEDUP = "dedup"
    EMBED = "embed"
    EXTRACT = "extract"
    FILTER = "filter"
    INFOGRAPHIC_DATA_EXTRACT = "infographic_data_extract"
    OCR_DATA_EXTRACT = "ocr_data_extract"
    SPLIT = "split"
    STORE_EMBEDDING = "store_embedding"
    STORE = "store"
    TABLE_DATA_EXTRACT = "table_data_extract"
    UDF = "udf"
    VDB_UPLOAD = "vdb_upload"
514
+
515
+
516
class TextTypeEnum(str, Enum):
    """
    Types of text segments.

    Each member's value is its name in lower case. Members are intentionally
    left without type annotations, as the typing specification requires for
    enum members.

    Attributes
    ----------
    BLOCK
        A text block.
    BODY
        Body text.
    DOCUMENT
        An entire document.
    HEADER
        Header text.
    LINE
        A single line of text.
    NEARBY_BLOCK
        A block of text in close proximity to another.
    OTHER
        Other, unspecified text type.
    PAGE
        A page of text.
    SPAN
        An inline text span.
    """

    BLOCK = "block"
    BODY = "body"
    DOCUMENT = "document"
    HEADER = "header"
    LINE = "line"
    NEARBY_BLOCK = "nearby_block"
    OTHER = "other"
    PAGE = "page"
    SPAN = "span"
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0