nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,166 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Copyright (c) 2024, NVIDIA CORPORATION.
5
+
6
+ import base64
7
+ import inspect
8
+ import io
9
+ import logging
10
+ from typing import Any
11
+ from typing import Dict
12
+ from typing import List
13
+ from typing import Optional
14
+ from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
15
+
16
+ import pandas as pd
17
+ from nv_ingest_api.internal.extract.pdf.engines import adobe_extractor
18
+ from nv_ingest_api.internal.extract.pdf.engines import llama_parse_extractor
19
+ from nv_ingest_api.internal.extract.pdf.engines import nemotron_parse_extractor
20
+ from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor
21
+ from nv_ingest_api.internal.extract.pdf.engines import tika_extractor
22
+ from nv_ingest_api.internal.extract.pdf.engines import unstructured_io_extractor
23
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
24
+
25
+ # Import extraction functions for different engines.
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
# Dispatch table: extraction method name -> extractor callable.
# Several method names intentionally share the pdfium extractor.
EXTRACTOR_LOOKUP = {
    "adobe": adobe_extractor,
    "llama": llama_parse_extractor,
    "nemotron_parse": nemotron_parse_extractor,
    "pdfium": pdfium_extractor,
    # Uses pdfium for native text and switches to the OCR pipeline only for scanned pages.
    "pdfium_hybrid": pdfium_extractor,
    "tika": tika_extractor,
    "unstructured_io": unstructured_io_extractor,
    # Ignores pdfium's text entirely and processes every single page through the full OCR pipeline.
    "ocr": pdfium_extractor,
}

# Methods whose configuration is stored under a key other than "<method>_config".
METHOD_TO_CONFIG_KEY_MAP = {
    "pdfium_hybrid": "pdfium_config",
    "ocr": "pdfium_config",
}
45
+
46
+
47
def _work_extract_pdf(
    *,
    pdf_stream: io.BytesIO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extract_page_as_image: bool,
    extractor_config: dict,
    execution_trace_log=None,
) -> Any:
    """
    Run the extractor selected by ``extractor_config["extract_method"]`` on a PDF stream.

    Parameters
    ----------
    pdf_stream : io.BytesIO
        The decoded PDF bytes.
    extract_text, extract_images, extract_infographics, extract_tables, extract_charts : bool
        Content-type flags forwarded unchanged to the extractor.
    extract_page_as_image : bool
        Forwarded only when the selected extractor declares a parameter of
        that name; otherwise a warning is logged if it is True.
    extractor_config : dict
        Must contain ``"extract_method"``; also forwarded to the extractor.
    execution_trace_log : optional
        Forwarded unchanged to the extractor.

    Returns
    -------
    Any
        Whatever the selected extractor returns.

    Raises
    ------
    KeyError
        If ``extractor_config`` has no ``"extract_method"`` entry.
    """
    method_name = extractor_config["extract_method"]
    # Method names missing from the lookup table fall back to the pdfium extractor.
    extractor = EXTRACTOR_LOOKUP.get(method_name, pdfium_extractor)

    call_kwargs = {
        "pdf_stream": pdf_stream,
        "extract_text": extract_text,
        "extract_images": extract_images,
        "extract_infographics": extract_infographics,
        "extract_tables": extract_tables,
        "extract_charts": extract_charts,
        "extractor_config": extractor_config,
        "execution_trace_log": execution_trace_log,
    }

    # Not every engine accepts `extract_page_as_image`; inspect the callable
    # so the keyword is passed only where it is declared.
    accepts_page_image = "extract_page_as_image" in inspect.signature(extractor).parameters
    if accepts_page_image:
        call_kwargs["extract_page_as_image"] = extract_page_as_image
    elif extract_page_as_image:
        extract_method = method_name
        logger.warning(f"`extract_page_as_image` is set to True, but {extract_method} does not support it.")

    return extractor(**call_kwargs)
83
+
84
+
85
@unified_exception_handler
def _orchestrate_row_extraction(
    row: pd.Series,
    task_config: Dict[str, Any],
    extractor_config: Any,
    execution_trace_log: Optional[List[Any]] = None,
) -> Any:
    """
    Orchestrate extraction for a single DataFrame row.

    Decodes the row's base64 ``content`` into a PDF stream, assembles the
    per-call extractor configuration from ``task_config`` and
    ``extractor_config``, and delegates to ``_work_extract_pdf``.

    Parameters
    ----------
    row : pd.Series
        Must contain a ``"content"`` entry holding the base64-encoded PDF.
        All other entries are passed along as ``row_data``.
    task_config : Dict[str, Any]
        May contain ``"params"`` (extraction flags and options) and
        ``"method"`` (takes precedence over ``params["extract_method"]``).
    extractor_config : Any
        Either an object exposing ``<method>_config`` attributes or a dict
        with such keys. A missing entry is logged as a warning and is not
        treated as an error.
    execution_trace_log : Optional[List[Any]]
        Forwarded unchanged to the work function.

    Returns
    -------
    Any
        The result of ``_work_extract_pdf``.

    Raises
    ------
    KeyError
        If ``row`` has no ``"content"`` entry.
    Exception
        Decoding failures are re-raised as the same exception type with a
        descriptive message.

    Notes
    -----
    The function is wrapped by ``unified_exception_handler``; how raised
    exceptions surface to callers depends on that decorator (not visible
    here -- confirm before relying on the exception types above).
    """
    if "content" not in row:
        err_msg = f"Missing 'content' key in row: {row}"
        logger.error(err_msg)
        raise KeyError(err_msg)

    try:
        pdf_stream = io.BytesIO(base64.b64decode(row["content"]))
    except Exception as e:
        err_msg = f"Error decoding base64 content: {e}"
        logger.error(err_msg, exc_info=True)
        raise type(e)(err_msg) from e

    # Work on a copy so the caller's task parameters are never mutated.
    # An absent or explicit-None "params" entry is treated as empty.
    params = (task_config.get("params") or {}).copy()

    # Pull the boolean flags out of params; each one defaults to False.
    # `pop` with a default cannot raise KeyError, so no handler is needed.
    extract_text = params.pop("extract_text", False)
    extract_images = params.pop("extract_images", False)
    extract_tables = params.pop("extract_tables", False)
    extract_charts = params.pop("extract_charts", False)
    extract_infographics = params.pop("extract_infographics", False)
    extract_page_as_image = params.pop("extract_page_as_image", False)

    # Add row metadata (all columns except 'content') into the config.
    params["row_data"] = row.drop("content")

    # Precedence: task_config["method"] > params["extract_method"] > "pdfium".
    extract_method = task_config.get("method", params.get("extract_method", "pdfium"))
    params["extract_method"] = extract_method

    # Construct the config key based on the extraction method.
    config_key = METHOD_TO_CONFIG_KEY_MAP.get(extract_method, f"{extract_method}_config")

    # Handle both object and dictionary cases for extractor_config.
    if hasattr(extractor_config, config_key):
        # Object case: attribute access (e.g. a Pydantic model).
        method_config = getattr(extractor_config, config_key)
    elif isinstance(extractor_config, dict) and config_key in extractor_config:
        # Dictionary case: key access.
        method_config = extractor_config[config_key]
    else:
        # If no matching config is found, log a warning but don't fail.
        logger.warning(f"No {config_key} found in extractor_config: {sanitize_for_logging(extractor_config)}")
        method_config = None

    # Add the method-specific config to the parameters if available.
    if method_config is not None:
        params[config_key] = method_config
        logger.debug(f"Added {config_key} to extraction parameters")

    # The resulting parameters constitute the complete extractor_config.
    extractor_config = params
    logger.debug(f"Final extractor_config: {sanitize_for_logging(extractor_config)}")

    return _work_extract_pdf(
        pdf_stream=pdf_stream,
        extract_text=extract_text,
        extract_images=extract_images,
        extract_infographics=extract_infographics,
        extract_page_as_image=extract_page_as_image,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extractor_config=extractor_config,
        execution_trace_log=execution_trace_log,
    )