nv-ingest-api 2025.3.27.dev20250327__py3-none-any.whl → 2025.3.28.dev20250328__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153):
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.28.dev20250328.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.3.28.dev20250328.dist-info/RECORD +9 -0
  6. nv_ingest_api/interface/__init__.py +0 -215
  7. nv_ingest_api/interface/extract.py +0 -972
  8. nv_ingest_api/interface/mutate.py +0 -154
  9. nv_ingest_api/interface/store.py +0 -218
  10. nv_ingest_api/interface/transform.py +0 -382
  11. nv_ingest_api/interface/utility.py +0 -200
  12. nv_ingest_api/internal/enums/__init__.py +0 -3
  13. nv_ingest_api/internal/enums/common.py +0 -494
  14. nv_ingest_api/internal/extract/__init__.py +0 -3
  15. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  17. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  18. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  19. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  20. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  23. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  24. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  25. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  26. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  27. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  28. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  29. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  30. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  34. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  40. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  44. nv_ingest_api/internal/mutate/__init__.py +0 -3
  45. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  46. nv_ingest_api/internal/mutate/filter.py +0 -133
  47. nv_ingest_api/internal/primitives/__init__.py +0 -0
  48. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  49. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  50. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  51. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  52. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  53. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  54. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -272
  55. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  56. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -452
  57. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  58. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  59. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  60. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  61. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  62. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  63. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  64. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  65. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  66. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  67. nv_ingest_api/internal/schemas/__init__.py +0 -3
  68. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  70. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  71. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  72. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  74. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  75. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  76. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  77. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  78. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  79. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  80. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  81. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  82. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  83. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  84. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  85. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  86. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  87. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  88. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  89. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  90. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  91. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  92. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  93. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  94. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  95. nv_ingest_api/internal/store/__init__.py +0 -3
  96. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  97. nv_ingest_api/internal/store/image_upload.py +0 -232
  98. nv_ingest_api/internal/transform/__init__.py +0 -3
  99. nv_ingest_api/internal/transform/caption_image.py +0 -205
  100. nv_ingest_api/internal/transform/embed_text.py +0 -496
  101. nv_ingest_api/internal/transform/split_text.py +0 -157
  102. nv_ingest_api/util/__init__.py +0 -0
  103. nv_ingest_api/util/control_message/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/validators.py +0 -47
  105. nv_ingest_api/util/converters/__init__.py +0 -0
  106. nv_ingest_api/util/converters/bytetools.py +0 -78
  107. nv_ingest_api/util/converters/containers.py +0 -65
  108. nv_ingest_api/util/converters/datetools.py +0 -90
  109. nv_ingest_api/util/converters/dftools.py +0 -127
  110. nv_ingest_api/util/converters/formats.py +0 -64
  111. nv_ingest_api/util/converters/type_mappings.py +0 -27
  112. nv_ingest_api/util/detectors/__init__.py +0 -5
  113. nv_ingest_api/util/detectors/language.py +0 -38
  114. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  115. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  116. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  117. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  118. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  119. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  120. nv_ingest_api/util/image_processing/__init__.py +0 -5
  121. nv_ingest_api/util/image_processing/clustering.py +0 -260
  122. nv_ingest_api/util/image_processing/processing.py +0 -179
  123. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  124. nv_ingest_api/util/image_processing/transforms.py +0 -407
  125. nv_ingest_api/util/logging/__init__.py +0 -0
  126. nv_ingest_api/util/logging/configuration.py +0 -31
  127. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  128. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  129. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  130. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  131. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
  132. nv_ingest_api/util/metadata/__init__.py +0 -5
  133. nv_ingest_api/util/metadata/aggregators.py +0 -469
  134. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  135. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  136. nv_ingest_api/util/nim/__init__.py +0 -56
  137. nv_ingest_api/util/pdf/__init__.py +0 -3
  138. nv_ingest_api/util/pdf/pdfium.py +0 -427
  139. nv_ingest_api/util/schema/__init__.py +0 -0
  140. nv_ingest_api/util/schema/schema_validator.py +0 -10
  141. nv_ingest_api/util/service_clients/__init__.py +0 -3
  142. nv_ingest_api/util/service_clients/client_base.py +0 -72
  143. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  144. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
  146. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  147. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -368
  148. nv_ingest_api/util/string_processing/__init__.py +0 -51
  149. nv_ingest_api-2025.3.27.dev20250327.dist-info/RECORD +0 -152
  150. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  151. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.28.dev20250328.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.28.dev20250328.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.28.dev20250328.dist-info}/top_level.txt +0 -0
@@ -1,146 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
- # Copyright (c) 2024, NVIDIA CORPORATION.
5
-
6
- import base64
7
- import io
8
-
9
- import pandas as pd
10
- from typing import Any, Dict, List, Optional
11
- import logging
12
-
13
- from nv_ingest_api.internal.extract.pdf.engines import (
14
- adobe_extractor,
15
- llama_parse_extractor,
16
- nemoretriever_parse_extractor,
17
- pdfium_extractor,
18
- tika_extractor,
19
- unstructured_io_extractor,
20
- )
21
- from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
22
-
23
- # Import extraction functions for different engines.
24
-
25
- logger = logging.getLogger(__name__)
26
-
27
# Dispatch table: the "extract_method" name carried in an extractor config
# selects which engine-specific extractor callable processes the PDF.
EXTRACTOR_LOOKUP = dict(
    adobe=adobe_extractor,
    llama=llama_parse_extractor,
    nemoretriever_parse=nemoretriever_parse_extractor,
    pdfium=pdfium_extractor,
    tika=tika_extractor,
    unstructured_io=unstructured_io_extractor,
)
36
-
37
-
38
def _work_extract_pdf(
    *,
    pdf_stream: io.BytesIO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extractor_config: dict,
    execution_trace_log: Optional[List[Any]] = None,
) -> Any:
    """
    Perform PDF extraction on a decoded PDF stream using the given extraction parameters.

    The engine is selected by ``extractor_config["extract_method"]`` through
    ``EXTRACTOR_LOOKUP``. A method name that is not in the table falls back to
    ``pdfium_extractor``; that fallback is logged as a warning so a mistyped
    or unsupported method name does not go unnoticed.

    Parameters
    ----------
    pdf_stream : io.BytesIO
        In-memory PDF bytes, passed to the extractor as its first argument.
    extract_text, extract_images, extract_infographics, extract_tables, extract_charts : bool
        Extraction flags, forwarded positionally to the extractor in this order.
    extractor_config : dict
        Extraction configuration. Must contain the key ``"extract_method"``;
        the whole dict is also forwarded to the extractor.
    execution_trace_log : list, optional
        Forwarded unchanged to the extractor as its last argument.

    Returns
    -------
    Any
        Whatever the selected extractor function returns.

    Raises
    ------
    KeyError
        If ``extractor_config`` has no ``"extract_method"`` entry.
    """
    extract_method = extractor_config["extract_method"]

    extractor_fn = EXTRACTOR_LOOKUP.get(extract_method)
    if extractor_fn is None:
        # Keep the historical default engine, but surface the substitution
        # instead of silently running a different extractor than requested.
        logger.warning(
            "Unknown PDF extract_method %r; falling back to 'pdfium'",
            extract_method,
        )
        extractor_fn = pdfium_extractor

    return extractor_fn(
        pdf_stream,
        extract_text,
        extract_images,
        extract_infographics,
        extract_tables,
        extract_charts,
        extractor_config,
        execution_trace_log,
    )
65
-
66
-
67
@unified_exception_handler
def _orchestrate_row_extraction(
    row: pd.Series,
    task_config: Dict[str, Any],
    extractor_config: Any,
    execution_trace_log: Optional[List[Any]] = None,
) -> Any:
    """
    Orchestrate extraction for a single DataFrame row by decoding the PDF stream,
    building an extractor_config, and then delegating to the work function.

    Parameters
    ----------
    row : pd.Series
        A row that must contain a ``"content"`` entry holding the
        base64-encoded PDF. Every other entry is passed along to the
        extractor under ``extractor_config["row_data"]``.
    task_config : Dict[str, Any]
        Task description. ``task_config["params"]`` (optional, may be ``None``)
        supplies the ``extract_*`` flags, each defaulting to ``False``, and an
        optional ``"extract_method"``. ``task_config["method"]``, when present,
        takes precedence over ``params["extract_method"]``; the default
        method is ``"pdfium"``.
    extractor_config : Any
        Either an object exposing ``<method>_config`` attributes or a dict with
        ``<method>_config`` keys. The entry matching the chosen method, if
        any, is copied into the parameters handed to the extractor.
    execution_trace_log : list, optional
        Forwarded unchanged to ``_work_extract_pdf``.

    Returns
    -------
    Any
        The value returned by ``_work_extract_pdf``.

    Raises
    ------
    KeyError
        If ``row`` has no ``"content"`` entry.
    Exception
        Any error raised while base64-decoding the content is re-raised as the
        same exception type with a descriptive message, chained to the original.

    Notes
    -----
    The function is wrapped by ``unified_exception_handler``, which is defined
    outside this module; how it treats the exceptions above is not visible
    here and should be confirmed against that decorator.
    """
    if "content" not in row:
        err_msg = f"Missing 'content' key in row: {row}"
        logger.error(err_msg)
        raise KeyError(err_msg)

    try:
        pdf_stream = io.BytesIO(base64.b64decode(row["content"]))
    except Exception as e:
        err_msg = f"Error decoding base64 content: {e}"
        logger.error(err_msg, exc_info=True)
        # Re-raise as the same type so callers' except clauses keep matching.
        raise type(e)(err_msg) from e

    # Work on a shallow copy so the pops below never mutate the caller's
    # task_config. `or {}` also covers an explicit `"params": None`.
    params = dict(task_config.get("params") or {})

    # Pull the boolean flags out of params; whatever remains in params is
    # forwarded to the extractor. pop()/get() with a default cannot raise
    # KeyError, so no exception handling is needed here.
    extract_text = params.pop("extract_text", False)
    extract_images = params.pop("extract_images", False)
    extract_tables = params.pop("extract_tables", False)
    extract_charts = params.pop("extract_charts", False)
    extract_infographics = params.pop("extract_infographics", False)

    # Add row metadata (all columns except 'content') into the config.
    params["row_data"] = row.drop("content")

    # task_config["method"] wins over params["extract_method"]; default "pdfium".
    extract_method = task_config.get("method", params.get("extract_method", "pdfium"))
    params["extract_method"] = extract_method

    # Construct the config key based on the extraction method.
    config_key = f"{extract_method}_config"

    # Handle both object and dictionary cases for extractor_config.
    if hasattr(extractor_config, config_key):
        # Object case: attribute access (e.g. a schema/model instance).
        method_config = getattr(extractor_config, config_key)
    elif isinstance(extractor_config, dict) and config_key in extractor_config:
        # Dictionary case: key access.
        method_config = extractor_config[config_key]
    else:
        # If no matching config is found, log a warning but don't fail.
        logger.warning(f"No {config_key} found in extractor_config: {extractor_config}")
        method_config = None

    # Add the method-specific config to the parameters if available.
    if method_config is not None:
        params[config_key] = method_config
        logger.debug(f"Added {config_key} to extraction parameters")

    # The resulting parameters constitute the complete extractor_config.
    extractor_config = params
    logger.debug(f"Final extractor_config: {extractor_config}")

    return _work_extract_pdf(
        pdf_stream=pdf_stream,
        extract_text=extract_text,
        extract_images=extract_images,
        extract_infographics=extract_infographics,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extractor_config=extractor_config,
        execution_trace_log=execution_trace_log,
    )