nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.23.dev20250423__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.23.dev20250423.dist-info/RECORD +152 -0
  149. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/WHEEL +1 -1
  150. nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
  151. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,494 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from enum import Enum
8
+ from typing import Type, Any
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class AccessLevelEnum(int, Enum):
    """
    Enum for representing different access levels.

    Note
    ----
    This is for future use, and currently has no functional use case.

    Members are also ``int`` instances because of the ``int`` mixin.

    Attributes
    ----------
    UNKNOWN : int
        Represents an unknown access level (value ``-1``).
    LEVEL_1 : int
        Represents access level 1.
    LEVEL_2 : int
        Represents access level 2.
    LEVEL_3 : int
        Represents access level 3.
    """

    UNKNOWN: int = -1
    LEVEL_1: int = 1
    LEVEL_2: int = 2
    LEVEL_3: int = 3
35
+
36
+
37
class ContentDescriptionEnum(str, Enum):
    """
    Enum for standard content descriptions extracted from different source types.

    Each member's value is a complete human-readable sentence; members are
    also ``str`` instances because of the ``str`` mixin.

    Attributes
    ----------
    DOCX_IMAGE : str
        Description for image extracted from DOCX document.
    DOCX_TABLE : str
        Description for structured table extracted from DOCX document.
    DOCX_TEXT : str
        Description for unstructured text from DOCX document.
    PDF_CHART : str
        Description for structured chart extracted from PDF document.
    PDF_IMAGE : str
        Description for image extracted from PDF document.
    PDF_INFOGRAPHIC : str
        Description for structured infographic extracted from PDF document.
    PDF_TABLE : str
        Description for structured table extracted from PDF document.
    PDF_TEXT : str
        Description for unstructured text from PDF document.
    PPTX_IMAGE : str
        Description for image extracted from PPTX presentation.
    PPTX_TABLE : str
        Description for structured table extracted from PPTX presentation.
    PPTX_TEXT : str
        Description for unstructured text from PPTX presentation.
    """

    DOCX_IMAGE: str = "Image extracted from DOCX document."
    DOCX_TABLE: str = "Structured table extracted from DOCX document."
    DOCX_TEXT: str = "Unstructured text from DOCX document."
    PDF_CHART: str = "Structured chart extracted from PDF document."
    PDF_IMAGE: str = "Image extracted from PDF document."
    PDF_INFOGRAPHIC: str = "Structured infographic extracted from PDF document."
    PDF_TABLE: str = "Structured table extracted from PDF document."
    PDF_TEXT: str = "Unstructured text from PDF document."
    PPTX_IMAGE: str = "Image extracted from PPTX presentation."
    PPTX_TABLE: str = "Structured table extracted from PPTX presentation."
    PPTX_TEXT: str = "Unstructured text from PPTX presentation."
78
+
79
+
80
class ContentTypeEnum(str, Enum):
    """
    Enum for representing various content types.

    Note: Content type declares the broad category of the content, such as text, image, audio, etc.
    This is not equivalent to the Document type, which is a specific file format.

    Attributes
    ----------
    AUDIO : str
        Represents audio content.
    CHART : str
        Represents chart content.
    EMBEDDING : str
        Represents embedding content.
    IMAGE : str
        Represents image content.
    INFOGRAPHIC : str
        Represents infographic content.
    INFO_MSG : str
        Represents an informational message (value ``"info_message"``).
    NONE : str
        Represents the absence of a content type (value ``"none"``).
    STRUCTURED : str
        Represents structured content.
    TABLE : str
        Represents table content.
    TEXT : str
        Represents text content.
    UNKNOWN : str
        Represents an unknown content type.
    VIDEO : str
        Represents video content.
    """

    AUDIO: str = "audio"
    CHART: str = "chart"
    EMBEDDING: str = "embedding"
    IMAGE: str = "image"
    INFOGRAPHIC: str = "infographic"
    INFO_MSG: str = "info_message"
    NONE: str = "none"
    STRUCTURED: str = "structured"
    TABLE: str = "table"
    TEXT: str = "text"
    UNKNOWN: str = "unknown"
    VIDEO: str = "video"
119
+
120
+
121
class DocumentTypeEnum(str, Enum):
    """
    Enum for representing various document file types.

    Note: Document type refers to the specific file format of the content, such as PDF, DOCX, etc.
    This is not equivalent to the Content type, which is a broad category of the content.

    Attributes
    ----------
    BMP: str
        BMP image format.
    DOCX: str
        Microsoft Word document format.
    HTML: str
        HTML document.
    JPEG: str
        JPEG image format.
    PDF: str
        PDF document format.
    PNG: str
        PNG image format.
    PPTX: str
        PowerPoint presentation format.
    SVG: str
        SVG image format.
    TIFF: str
        TIFF image format.
    TXT: str
        Plain text file (value ``"text"``).
    MD: str
        Declared with the same value as ``TXT`` (``"text"``), which makes it an
        alias of ``TXT`` rather than a distinct member.
    MP3: str
        MP3 audio format.
    WAV: str
        WAV audio format.
    UNKNOWN: str
        Represents an unknown document type.
    """

    BMP: str = "bmp"
    DOCX: str = "docx"
    HTML: str = "html"
    JPEG: str = "jpeg"
    PDF: str = "pdf"
    PNG: str = "png"
    PPTX: str = "pptx"
    SVG: str = "svg"
    TIFF: str = "tiff"
    TXT: str = "text"
    # Same value as TXT: Enum treats a duplicate value as an alias, so
    # `DocumentTypeEnum.MD is DocumentTypeEnum.TXT` and iterating the enum
    # does not yield a separate MD member.
    MD: str = "text"
    MP3: str = "mp3"
    WAV: str = "wav"
    UNKNOWN: str = "unknown"
170
+
171
+
172
class LanguageEnum(str, Enum):
    """
    Enum for representing various language codes.

    Attributes
    ----------
    AF : str
        Afrikaans language code.
    AR : str
        Arabic language code.
    BG : str
        Bulgarian language code.
    BN : str
        Bengali language code.
    CA : str
        Catalan language code.
    CS : str
        Czech language code.
    CY : str
        Welsh language code.
    DA : str
        Danish language code.
    DE : str
        German language code.
    EL : str
        Greek language code.
    EN : str
        English language code.
    ES : str
        Spanish language code.
    ET : str
        Estonian language code.
    FA : str
        Persian language code.
    FI : str
        Finnish language code.
    FR : str
        French language code.
    GU : str
        Gujarati language code.
    HE : str
        Hebrew language code.
    HI : str
        Hindi language code.
    HR : str
        Croatian language code.
    HU : str
        Hungarian language code.
    ID : str
        Indonesian language code.
    IT : str
        Italian language code.
    JA : str
        Japanese language code.
    KN : str
        Kannada language code.
    KO : str
        Korean language code.
    LT : str
        Lithuanian language code.
    LV : str
        Latvian language code.
    MK : str
        Macedonian language code.
    ML : str
        Malayalam language code.
    MR : str
        Marathi language code.
    NE : str
        Nepali language code.
    NL : str
        Dutch language code.
    NO : str
        Norwegian language code.
    PA : str
        Punjabi language code.
    PL : str
        Polish language code.
    PT : str
        Portuguese language code.
    RO : str
        Romanian language code.
    RU : str
        Russian language code.
    SK : str
        Slovak language code.
    SL : str
        Slovenian language code.
    SO : str
        Somali language code.
    SQ : str
        Albanian language code.
    SV : str
        Swedish language code.
    SW : str
        Swahili language code.
    TA : str
        Tamil language code.
    TE : str
        Telugu language code.
    TH : str
        Thai language code.
    TL : str
        Tagalog language code.
    TR : str
        Turkish language code.
    UK : str
        Ukrainian language code.
    UR : str
        Urdu language code.
    VI : str
        Vietnamese language code.
    ZH_CN : str
        Chinese (Simplified) language code.
    ZH_TW : str
        Chinese (Traditional) language code.
    UNKNOWN : str
        Represents an unknown language.
    """

    AF: str = "af"
    AR: str = "ar"
    BG: str = "bg"
    BN: str = "bn"
    CA: str = "ca"
    CS: str = "cs"
    CY: str = "cy"
    DA: str = "da"
    DE: str = "de"
    EL: str = "el"
    EN: str = "en"
    ES: str = "es"
    ET: str = "et"
    FA: str = "fa"
    FI: str = "fi"
    FR: str = "fr"
    GU: str = "gu"
    HE: str = "he"
    HI: str = "hi"
    HR: str = "hr"
    HU: str = "hu"
    ID: str = "id"
    IT: str = "it"
    JA: str = "ja"
    KN: str = "kn"
    KO: str = "ko"
    LT: str = "lt"
    LV: str = "lv"
    MK: str = "mk"
    ML: str = "ml"
    MR: str = "mr"
    NE: str = "ne"
    NL: str = "nl"
    NO: str = "no"
    PA: str = "pa"
    PL: str = "pl"
    PT: str = "pt"
    RO: str = "ro"
    RU: str = "ru"
    SK: str = "sk"
    SL: str = "sl"
    SO: str = "so"
    SQ: str = "sq"
    SV: str = "sv"
    SW: str = "sw"
    TA: str = "ta"
    TE: str = "te"
    TH: str = "th"
    TL: str = "tl"
    TR: str = "tr"
    UK: str = "uk"
    UR: str = "ur"
    VI: str = "vi"
    ZH_CN: str = "zh-cn"
    ZH_TW: str = "zh-tw"
    UNKNOWN: str = "unknown"

    @classmethod
    def has_value(cls: Type["LanguageEnum"], value: Any) -> bool:
        """
        Check if the enum contains the given value.

        Parameters
        ----------
        value : Any
            The value to check against the enum members. Any object is
            accepted, including unhashable ones.

        Returns
        -------
        bool
            True if the value exists in the enum, False otherwise.
        """
        try:
            return value in cls._value2member_map_
        except TypeError:
            # The lookup is a dict membership test, which raises for
            # unhashable inputs (list, dict, set, ...). Such objects can never
            # be a member value, so report "not present" instead of failing.
            return False
365
+
366
+
367
class StatusEnum(str, Enum):
    """
    Enum for representing status messages.

    Members are also ``str`` instances because of the ``str`` mixin.

    Attributes
    ----------
    ERROR : str
        Represents an error status.
    SUCCESS : str
        Represents a success status.
    """

    ERROR: str = "error"
    SUCCESS: str = "success"
381
+
382
+
383
class TableFormatEnum(str, Enum):
    """
    Enum for representing table formats.

    Members are also ``str`` instances because of the ``str`` mixin.

    Attributes
    ----------
    HTML : str
        Represents HTML table format.
    IMAGE : str
        Represents image table format.
    LATEX : str
        Represents LaTeX table format.
    MARKDOWN : str
        Represents Markdown table format.
    PSEUDO_MARKDOWN : str
        Represents pseudo Markdown table format.
    SIMPLE : str
        Represents simple table format.
    """

    HTML: str = "html"
    IMAGE: str = "image"
    LATEX: str = "latex"
    MARKDOWN: str = "markdown"
    PSEUDO_MARKDOWN: str = "pseudo_markdown"
    SIMPLE: str = "simple"
409
+
410
+
411
class TaskTypeEnum(str, Enum):
    """
    Enum for representing various task types.

    Members are also ``str`` instances because of the ``str`` mixin.

    Attributes
    ----------
    AUDIO_DATA_EXTRACT : str
        Represents a task for extracting audio data.
    CAPTION : str
        Represents a caption task.
    CHART_DATA_EXTRACT : str
        Represents a task for extracting chart data.
    DEDUP : str
        Represents a deduplication task.
    EMBED : str
        Represents an embedding task.
    EXTRACT : str
        Represents an extraction task.
    FILTER : str
        Represents a filtering task.
    INFOGRAPHIC_DATA_EXTRACT : str
        Represents a task for extracting infographic data.
    SPLIT : str
        Represents a splitting task.
    STORE_EMBEDDING : str
        Represents a task for storing embeddings.
    STORE : str
        Represents a storing task.
    TABLE_DATA_EXTRACT : str
        Represents a task for extracting table data.
    VDB_UPLOAD : str
        Represents a task for uploading to a vector database.
    """

    AUDIO_DATA_EXTRACT: str = "audio_data_extract"
    CAPTION: str = "caption"
    CHART_DATA_EXTRACT: str = "chart_data_extract"
    DEDUP: str = "dedup"
    EMBED: str = "embed"
    EXTRACT: str = "extract"
    FILTER: str = "filter"
    INFOGRAPHIC_DATA_EXTRACT: str = "infographic_data_extract"
    SPLIT: str = "split"
    STORE_EMBEDDING: str = "store_embedding"
    STORE: str = "store"
    TABLE_DATA_EXTRACT: str = "table_data_extract"
    VDB_UPLOAD: str = "vdb_upload"
458
+
459
+
460
class TextTypeEnum(str, Enum):
    """
    Enum for representing different types of text segments.

    Members are also ``str`` instances because of the ``str`` mixin.

    Attributes
    ----------
    BLOCK : str
        Represents a text block.
    BODY : str
        Represents body text.
    DOCUMENT : str
        Represents an entire document.
    HEADER : str
        Represents a header text.
    LINE : str
        Represents a single line of text.
    NEARBY_BLOCK : str
        Represents a block of text in close proximity to another.
    OTHER : str
        Represents other unspecified text type.
    PAGE : str
        Represents a page of text.
    SPAN : str
        Represents an inline text span.
    """

    BLOCK: str = "block"
    BODY: str = "body"
    DOCUMENT: str = "document"
    HEADER: str = "header"
    LINE: str = "line"
    NEARBY_BLOCK: str = "nearby_block"
    OTHER: str = "other"
    PAGE: str = "page"
    SPAN: str = "span"
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,149 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+
7
+ import pandas as pd
8
+ from typing import Any
9
+ from typing import Dict
10
+ from typing import Optional
11
+ from typing import Tuple
12
+
13
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
14
+ from nv_ingest_api.internal.primitives.nim.model_interface.parakeet import create_audio_inference_client
15
+ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
16
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema, AudioMetadataSchema
17
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
18
+ from nv_ingest_api.util.schema.schema_validator import validate_schema
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@unified_exception_handler
def _update_audio_metadata(row: pd.Series, audio_client: Any, trace_info: Dict) -> Dict:
    """
    Transcribe the audio content of a single row and return its updated metadata.

    Rows whose content type is not audio, or whose content is missing or
    empty, are returned untouched (their ``content`` is preserved).

    Parameters
    ----------
    row : pd.Series
        A row from the DataFrame containing metadata for the audio extraction.

    audio_client : Any
        The client used to call the audio inference model.

    trace_info : Dict
        Trace information used for logging or debugging.

    Returns
    -------
    Dict
        The metadata with ``audio_metadata`` populated (and the raw ``content``
        removed) for audio rows, otherwise the original metadata unchanged.

    Raises
    ------
    ValueError
        If critical information (such as metadata) is missing from the row.
    """

    metadata = row.get("metadata")

    if metadata is None:
        logger.error("Row does not contain 'metadata'.")
        raise ValueError("Row does not contain 'metadata'.")

    # Peek at the content without removing it, so rows skipped below keep
    # their payload and a missing key is treated the same as empty content.
    base64_audio = metadata.get("content")
    # `or {}` also covers an explicit `"content_metadata": None`.
    content_metadata = metadata.get("content_metadata") or {}

    # Only modify if content type is audio
    if (content_metadata.get("type") != ContentTypeEnum.AUDIO) or (base64_audio in (None, "")):
        return metadata

    # This row is being transcribed: drop the raw audio from the metadata
    # before inference, so the validated/returned metadata does not carry it.
    metadata.pop("content", None)

    # Modify audio metadata with the result from the inference model
    audio_result = audio_client.infer(
        base64_audio,
        model_name="parakeet",
        trace_info=trace_info,  # traceable_func arg
        stage_name="audio_extraction",
    )

    # NOTE(review): the caller stores only the returned dict in the
    # "metadata" column; confirm whether these row assignments are expected
    # to reach the DataFrame. The MetadataSchema validation is kept because
    # it raises on malformed metadata.
    row["document_type"] = ContentTypeEnum.AUDIO
    audio_metadata = {"audio_transcript": audio_result}
    metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
    row["metadata"] = validate_schema(metadata, MetadataSchema).model_dump()

    return metadata
77
+
78
+
79
def extract_text_from_audio_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extraction_config: AudioExtractorSchema,
    execution_trace_log: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Extracts audio data from a DataFrame.

    Connection settings are taken from
    ``task_config["params"]["extract_audio_params"]`` when provided and fall
    back to ``extraction_config.audio_extraction_config`` otherwise.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        DataFrame containing the content from which audio data is to be extracted.
        Its "metadata" column is overwritten in place.

    task_config : Dict[str, Any]
        Dictionary containing task properties and configurations.

    extraction_config : AudioExtractorSchema
        The validated configuration object for audio extraction.

    execution_trace_log : Optional[Dict], optional
        Optional trace information for debugging or logging. Defaults to None.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        A tuple containing the updated DataFrame and the trace information.

    Raises
    ------
    Exception
        If any error occurs during the audio data extraction process.
    """
    logger.debug(f"Entering audio extraction stage with {len(df_extraction_ledger)} rows.")

    # Tolerate "params"/"extract_audio_params" being absent or explicitly None.
    extract_params = (task_config.get("params") or {}).get("extract_audio_params") or {}
    audio_extraction_config = extraction_config.audio_extraction_config

    grpc_endpoint = extract_params.get("grpc_endpoint") or audio_extraction_config.audio_endpoints[0]
    http_endpoint = extract_params.get("http_endpoint") or audio_extraction_config.audio_endpoints[1]
    infer_protocol = extract_params.get("infer_protocol") or audio_extraction_config.audio_infer_protocol
    auth_token = extract_params.get("auth_token") or audio_extraction_config.auth_token
    function_id = extract_params.get("function_id") or audio_extraction_config.function_id
    ssl_cert = extract_params.get("ssl_cert") or audio_extraction_config.ssl_cert

    # `use_ssl` is a flag: with `or`, an explicit task-level False would be
    # discarded in favour of the config value. Fall back only when unset.
    use_ssl = extract_params.get("use_ssl")
    if use_ssl is None:
        use_ssl = audio_extraction_config.use_ssl

    parakeet_client = create_audio_inference_client(
        (grpc_endpoint, http_endpoint),
        infer_protocol=infer_protocol,
        auth_token=auth_token,
        function_id=function_id,
        use_ssl=use_ssl,
        ssl_cert=ssl_cert,
    )

    if execution_trace_log is None:
        execution_trace_log = {}
        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

    try:
        # Apply _update_audio_metadata to each row in the DataFrame
        df_extraction_ledger["metadata"] = df_extraction_ledger.apply(
            _update_audio_metadata, axis=1, args=(parakeet_client, execution_trace_log)
        )

        return df_extraction_ledger, execution_trace_log

    except Exception as e:
        # logger.exception already attaches the active traceback.
        logger.exception(f"Error occurred while extracting audio data: {e}")

        raise
@@ -0,0 +1,5 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2024, NVIDIA CORPORATION.