nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.20.dev20250420__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release: this version of nv-ingest-api might be problematic.

Files changed (153)
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +9 -0
  6. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/WHEEL +1 -1
  7. nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  23. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  24. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  25. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  26. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  27. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  28. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  29. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  30. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  31. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  32. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  33. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  34. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  35. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  36. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  37. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  38. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  39. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  40. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  41. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  42. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  43. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  44. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  45. nv_ingest_api/internal/mutate/__init__.py +0 -3
  46. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  47. nv_ingest_api/internal/mutate/filter.py +0 -133
  48. nv_ingest_api/internal/primitives/__init__.py +0 -0
  49. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  50. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  51. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  52. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  53. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  54. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  55. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  56. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  57. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  58. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  59. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  60. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  61. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  62. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  63. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  64. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  65. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  66. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  67. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  68. nv_ingest_api/internal/schemas/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  70. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  71. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  72. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  74. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  75. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  76. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  77. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  78. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  79. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  80. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  81. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  82. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  83. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  84. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  85. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  86. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  87. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  88. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  89. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  90. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  91. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  92. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  93. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  94. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  95. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  96. nv_ingest_api/internal/store/__init__.py +0 -3
  97. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  98. nv_ingest_api/internal/store/image_upload.py +0 -232
  99. nv_ingest_api/internal/transform/__init__.py +0 -3
  100. nv_ingest_api/internal/transform/caption_image.py +0 -205
  101. nv_ingest_api/internal/transform/embed_text.py +0 -496
  102. nv_ingest_api/internal/transform/split_text.py +0 -157
  103. nv_ingest_api/util/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/__init__.py +0 -0
  105. nv_ingest_api/util/control_message/validators.py +0 -47
  106. nv_ingest_api/util/converters/__init__.py +0 -0
  107. nv_ingest_api/util/converters/bytetools.py +0 -78
  108. nv_ingest_api/util/converters/containers.py +0 -65
  109. nv_ingest_api/util/converters/datetools.py +0 -90
  110. nv_ingest_api/util/converters/dftools.py +0 -127
  111. nv_ingest_api/util/converters/formats.py +0 -64
  112. nv_ingest_api/util/converters/type_mappings.py +0 -27
  113. nv_ingest_api/util/detectors/__init__.py +0 -5
  114. nv_ingest_api/util/detectors/language.py +0 -38
  115. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  116. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  117. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  118. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  119. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  120. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  121. nv_ingest_api/util/image_processing/__init__.py +0 -5
  122. nv_ingest_api/util/image_processing/clustering.py +0 -260
  123. nv_ingest_api/util/image_processing/processing.py +0 -179
  124. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  125. nv_ingest_api/util/image_processing/transforms.py +0 -407
  126. nv_ingest_api/util/logging/__init__.py +0 -0
  127. nv_ingest_api/util/logging/configuration.py +0 -31
  128. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  129. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  130. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  131. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  132. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
  133. nv_ingest_api/util/metadata/__init__.py +0 -5
  134. nv_ingest_api/util/metadata/aggregators.py +0 -469
  135. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  136. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  137. nv_ingest_api/util/nim/__init__.py +0 -56
  138. nv_ingest_api/util/pdf/__init__.py +0 -3
  139. nv_ingest_api/util/pdf/pdfium.py +0 -427
  140. nv_ingest_api/util/schema/__init__.py +0 -0
  141. nv_ingest_api/util/schema/schema_validator.py +0 -10
  142. nv_ingest_api/util/service_clients/__init__.py +0 -3
  143. nv_ingest_api/util/service_clients/client_base.py +0 -86
  144. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  145. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  146. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
  147. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  148. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
  149. nv_ingest_api/util/string_processing/__init__.py +0 -51
  150. nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
  151. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/transform/split_text.py
@@ -1,157 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
-
- import os
- import copy
- import logging
- import uuid
- from typing import Any, Optional, Dict
- from typing import List
-
- import pandas as pd
- from transformers import AutoTokenizer
-
- from nv_ingest_api.internal.enums.common import ContentTypeEnum
- from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
- from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
-
- logger = logging.getLogger(__name__)
-
-
- def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]:
-     """Build documents from text chunks"""
-     documents: List[dict] = []
-
-     for i, text in enumerate(chunks):
-         if text is None or not text.strip():
-             continue
-
-         metadata = row.metadata if hasattr(row, "metadata") and isinstance(row.metadata, dict) else {}
-         metadata = copy.deepcopy(metadata)
-
-         metadata["content"] = text
-
-         documents.append({"document_type": ContentTypeEnum.TEXT.value, "metadata": metadata, "uuid": str(uuid.uuid4())})
-
-     return documents
-
-
- def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=20):
-     # Tokenize the text into token IDs
-     encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True)
-
-     # Get the token IDs and offsets for splitting
-     offsets = encoding["offset_mapping"]
-
-     # Split the tokens into chunks of the desired size with the desired overlap
-     chunks = [offsets[i : i + chunk_size] for i in range(0, len(offsets), chunk_size - chunk_overlap)]
-
-     # Convert token chunks back to text while preserving original spacing and case
-     text_chunks = []
-     for chunk in chunks:
-         text_chunk = text[chunk[0][0] : chunk[-1][0]]
-         text_chunks.append(text_chunk)
-
-     return text_chunks
-
-
- @unified_exception_handler
- def transform_text_split_and_tokenize_internal(
-     df_transform_ledger: pd.DataFrame,
-     task_config: Dict[str, Any],
-     transform_config: TextSplitterSchema,
-     execution_trace_log: Optional[Dict[str, Any]],
- ) -> pd.DataFrame:
-     """
-     Internal function to split and tokenize text in a ledger DataFrame.
-
-     This function extracts text from documents that match a filter criteria based on source types,
-     splits the text into chunks using the specified tokenizer, and rebuilds document records with the
-     split text. The resulting DataFrame contains both split and unsplit documents.
-
-     Parameters
-     ----------
-     df_transform_ledger : pd.DataFrame
-         DataFrame containing documents to be processed. Expected to have columns 'document_type' and
-         'metadata', where 'metadata' includes a 'content' field and nested source information.
-     task_config : dict
-         Dictionary with task-specific configuration. Expected keys include:
-         - "tokenizer": Tokenizer identifier or path.
-         - "chunk_size": Maximum number of tokens per chunk.
-         - "chunk_overlap": Number of tokens to overlap between chunks.
-         - "params": A sub-dictionary that may contain:
-           - "hf_access_token": Hugging Face access token.
-           - "split_source_types": List of source types to filter for splitting.
-     transform_config : TextSplitterSchema
-         Configuration object providing default values for text splitting parameters.
-     execution_trace_log : Optional[dict]
-         Optional dictionary for logging execution trace information; may be None.
-
-     Returns
-     -------
-     pd.DataFrame
-         DataFrame with processed documents. Documents with text matching the filter are split into chunks,
-         and then merged with those that do not match the filter.
-
-     Raises
-     ------
-     ValueError
-         If the text splitting or tokenization process fails.
-     """
-     _ = execution_trace_log  # Placeholder for potential execution trace logging.
-
-     # Override parameters using task_config, with fallback to transform_config.
-     tokenizer_identifier: Optional[str] = task_config.get("tokenizer", transform_config.tokenizer)
-     chunk_size: int = task_config.get("chunk_size", transform_config.chunk_size)
-     chunk_overlap: int = task_config.get("chunk_overlap", transform_config.chunk_overlap)
-     params: Dict[str, Any] = task_config.get("params", {})
-
-     hf_access_token: Optional[str] = params.get("hf_access_token", None)
-     split_source_types: List[str] = params.get("split_source_types", ["text"])
-
-     logger.debug(
-         f"Splitting text with tokenizer: {tokenizer_identifier}, "
-         f"chunk_size: {chunk_size} tokens, "
-         f"chunk_overlap: {chunk_overlap}"
-     )
-
-     # Filter to documents with text content.
-     bool_index = (df_transform_ledger["document_type"] == ContentTypeEnum.TEXT) & (
-         pd.json_normalize(df_transform_ledger["metadata"])["source_metadata.source_type"].isin(split_source_types)
-     )
-     df_filtered: pd.DataFrame = df_transform_ledger.loc[bool_index]
-
-     if df_filtered.empty:
-         return df_transform_ledger
-
-     model_predownload_path = os.environ.get("MODEL_PREDOWNLOAD_PATH")
-
-     if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
-         tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
-     ):
-         tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
-     elif os.path.exists(os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")) and (
-         tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"
-     ):
-         tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
-
-     tokenizer_model = AutoTokenizer.from_pretrained(tokenizer_identifier, token=hf_access_token)
-
-     split_docs: List[Dict[str, Any]] = []
-     for _, row in df_filtered.iterrows():
-         content: str = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
-         chunks: List[str] = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap)
-         split_docs.extend(_build_split_documents(row, chunks))
-
-     split_docs_df: pd.DataFrame = pd.DataFrame(split_docs)
-
-     # Merge split documents with unsplit documents.
-     merged_df: pd.DataFrame = pd.concat([split_docs_df, df_transform_ledger[~bool_index]], axis=0).reset_index(
-         drop=True
-     )
-
-     result, execution_trace_log = merged_df, {}
-
-     return result
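
The windowing in _split_into_chunks above steps through the token offsets in strides of chunk_size - chunk_overlap, so consecutive chunks share chunk_overlap tokens. A minimal sketch of the same arithmetic, with toy offsets in place of a Hugging Face tokenizer (the values are illustrative, not from the package):

    # Pretend each of 10 tokens covers 4 characters of the source text.
    offsets = [(i * 4, i * 4 + 4) for i in range(10)]
    chunk_size, chunk_overlap = 4, 1

    # Same slicing pattern as the removed code: stride = size - overlap.
    chunks = [offsets[i : i + chunk_size] for i in range(0, len(offsets), chunk_size - chunk_overlap)]

    for chunk in chunks:
        # Mirrors text[chunk[0][0] : chunk[-1][0]]: each chunk spans from its
        # first token's start offset to its last token's start offset.
        print(chunk[0][0], chunk[-1][0])  # 0 12, 12 24, 24 36, 36 36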
nv_ingest_api/util/__init__.py: File without changes
nv_ingest_api/util/control_message/__init__.py: File without changes
nv_ingest_api/util/control_message/validators.py
@@ -1,47 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
-
-
- def cm_ensure_payload_not_null(control_message: IngestControlMessage):
-     """
-     Ensures that the payload of a IngestControlMessage is not None.
-
-     Parameters
-     ----------
-     control_message : IngestControlMessage
-         The IngestControlMessage to check.
-
-     Raises
-     ------
-     ValueError
-         If the payload is None.
-     """
-
-     if control_message.payload() is None:
-         raise ValueError("Payload cannot be None")
-
-
- def cm_set_failure(control_message: IngestControlMessage, reason: str) -> IngestControlMessage:
-     """
-     Sets the failure metadata on a IngestControlMessage.
-
-     Parameters
-     ----------
-     control_message : IngestControlMessage
-         The IngestControlMessage to set the failure metadata on.
-     reason : str
-         The reason for the failure.
-
-     Returns
-     -------
-     control_message : IngestControlMessage
-         The modified IngestControlMessage with the failure metadata set.
-     """
-
-     control_message.set_metadata("cm_failed", True)
-     control_message.set_metadata("cm_failed_reason", reason)
-
-     return control_message
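
Both validators operate on a live message object; a short usage sketch, assuming msg is an IngestControlMessage with the payload()/set_metadata() API used above:

    try:
        cm_ensure_payload_not_null(msg)  # raises ValueError when payload() is None
    except ValueError as err:
        # Tags the message with cm_failed / cm_failed_reason metadata.
        msg = cm_set_failure(msg, reason=str(err))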
nv_ingest_api/util/converters/__init__.py: File without changes
nv_ingest_api/util/converters/bytetools.py
@@ -1,78 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
-
- import base64
-
-
- def bytesfromhex(hex_input):
-     """
-     Function to convert hex to bytes.
-
-     Parameters
-     ----------
-     hex_input : hex
-         Hex string to store bytes in cuDF.
-
-     Returns
-     -------
-     bytes
-         Hex encoded object converted to bytes.
-     """
-
-     return bytes.fromhex(hex_input)
-
-
- def hexfrombytes(bytes_input):
-     """
-     Function to bytes to hex string.
-
-     Parameters
-     ----------
-     bytes_input : bytes
-         Raw bytes of object.
-
-     Returns
-     -------
-     hex
-         Hex string to store bytes in cuDF.
-     """
-
-     return bytes_input.hex()
-
-
- def bytesfrombase64(base64_input):
-     """
-     Function to convert base64 encoded string to bytes.
-
-     Parameters
-     ----------
-     base64_input : hex
-         Base64 encoded string to store bytes in cuDF.
-
-     Returns
-     -------
-     bytes
-         Base64 encoded string converted to bytes.
-     """
-
-     return base64.b64decode(base64_input)
-
-
- def base64frombytes(bytes_input, encoding="utf-8"):
-     """
-     Function to bytes to base64 string.
-
-     Parameters
-     ----------
-     bytes_input : bytes
-         Raw bytes of object.
-
-     Returns
-     -------
-     base64
-         base64 encoded string to store bytes in cuDF.
-     """
-
-     return base64.b64encode(bytes_input).decode(encoding)
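
The four converters are thin wrappers over the standard library, so each pair round-trips. A quick standalone check using the equivalent stdlib calls (bytes.hex/bytes.fromhex and base64.b64encode/b64decode):

    import base64

    payload = b"\x00\xffnv-ingest"

    # hexfrombytes -> bytesfromhex round trip
    assert bytes.fromhex(payload.hex()) == payload
    # base64frombytes -> bytesfrombase64 round trip
    assert base64.b64decode(base64.b64encode(payload).decode("utf-8")) == payload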
nv_ingest_api/util/converters/containers.py
@@ -1,65 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
-
- import logging
- from typing import Any
- from typing import Dict
-
- logger = logging.getLogger(__name__)
-
-
- def merge_dict(defaults: Dict[str, Any], overrides: Dict[str, Any]) -> Dict[str, Any]:
-     """
-     Recursively merges two dictionaries, with values from the `overrides` dictionary taking precedence.
-
-     This function merges the `overrides` dictionary into the `defaults` dictionary. If a key in both dictionaries
-     has a dictionary as its value, the function will recursively merge those dictionaries. Otherwise, the value
-     from the `overrides` dictionary will overwrite the value in the `defaults` dictionary.
-
-     Parameters
-     ----------
-     defaults : dict of {str: Any}
-         The default dictionary that will be updated with values from the `overrides` dictionary.
-     overrides : dict of {str: Any}
-         The dictionary containing values that will override or extend those in the `defaults` dictionary.
-
-     Returns
-     -------
-     dict of {str: Any}
-         The merged dictionary, with values from the `overrides` dictionary taking precedence.
-
-     Examples
-     --------
-     >>> defaults = {
-     ...     "a": 1,
-     ...     "b": {
-     ...         "c": 3,
-     ...         "d": 4
-     ...     },
-     ...     "e": 5
-     ... }
-     >>> overrides = {
-     ...     "b": {
-     ...         "c": 30
-     ...     },
-     ...     "f": 6
-     ... }
-     >>> result = merge_dict(defaults, overrides)
-     >>> result
-     {'a': 1, 'b': {'c': 30, 'd': 4}, 'e': 5, 'f': 6}
-
-     Notes
-     -----
-     - The `merge_dict` function modifies the `defaults` dictionary in place. If you need to preserve the original
-       `defaults` dictionary, consider passing a copy instead.
-     - This function is particularly useful when combining configuration dictionaries where certain settings should
-       override defaults.
-     """
-     for key, value in overrides.items():
-         if isinstance(value, dict) and value:
-             defaults[key] = merge_dict(defaults.get(key, {}), value)
-         else:
-             defaults[key] = overrides[key]
-     return defaults
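
As the Notes section warns, merge_dict mutates defaults in place; the copy idiom it suggests looks like this (a sketch using only the function above and the standard library):

    import copy

    defaults = {"a": 1, "b": {"c": 3}}
    overrides = {"b": {"c": 30}}

    merged = merge_dict(copy.deepcopy(defaults), overrides)  # merge a copy instead
    assert merged == {"a": 1, "b": {"c": 30}}
    assert defaults == {"a": 1, "b": {"c": 3}}  # original left untouched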
nv_ingest_api/util/converters/datetools.py
@@ -1,90 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
-
- from datetime import datetime
- from datetime import timezone
-
- from dateutil.parser import parse
-
- from nv_ingest_api.util.exception_handlers.converters import datetools_exception_handler
-
-
- @datetools_exception_handler
- def datetimefrompdfmeta(pdf_formated_date: str, keep_tz: bool = False) -> str:
-     """
-     Convert PDF metadata formatted date string to a datetime object.
-
-     Parameters
-     ----------
-     pdf_formated_date : str
-         A date string in standard PDF metadata format.
-         Example: `str("D:20211222141131-07'00'")`
-     keep_tz : bool, optional
-         Keep or remove the timezone attribute of the parsed datetime object. If `False` (necessary for arrow format),
-         the timezone offset will be added to the datetime. Parsed datetimes will be in the same local time.
-
-     Returns
-     -------
-     str
-         A datetime object parsed from the input date string in ISO 8601 format.
-
-     """
-
-     try:
-         # standard pdf date format
-         pattern = "D:%Y%m%d%H%M%S%z"
-         # clean up date string
-         cleaned_date_string = pdf_formated_date[:-1].replace("'", ":")
-         parsed_dt_tz = datetime.strptime(cleaned_date_string, pattern)
-     except ValueError:
-         parsed_dt_tz = parse(pdf_formated_date, fuzzy=True)
-
-     if not keep_tz:
-         return remove_tz(parsed_dt_tz).isoformat()
-
-     return parsed_dt_tz.isoformat()
-
-
- def remove_tz(datetime_obj: datetime) -> datetime:
-     """
-     Remove timezone and add offset to a datetime object.
-
-     Parameters
-     ----------
-     datetime_obj : datetime.datetime
-         A datetime object with or without the timezone attribute set.
-
-     Returns
-     -------
-     datetime.datetime
-         A datetime object with the timezone offset added and the timezone attribute removed.
-
-     """
-
-     if datetime_obj.tzinfo is not None:  # If timezone info is present
-         # Convert to UTC
-         datetime_obj = datetime_obj.astimezone(timezone.utc)
-         # Remove timezone information
-         datetime_obj = datetime_obj.replace(tzinfo=None)
-
-     return datetime_obj
-
-
- def validate_iso8601(date_string: str) -> None:
-     """
-     Verify that the given date string is in ISO 8601 format.
-
-     Parameters
-     ----------
-     date_string : str
-         A date string in human-readable format, ideally ISO 8601.
-
-     Raises
-     ------
-     ValueError
-         If the date string is not in a valid ISO 8601 format.
-     """
-
-     assert datetime.fromisoformat(date_string)
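
The try branch normalizes the PDF date before strptime: it drops the trailing apostrophe and converts the remaining apostrophes to colons so %z can parse the offset (Python 3.7+ accepts a colon there). Traced by hand for the docstring's example:

    raw = "D:20211222141131-07'00'"
    cleaned = raw[:-1].replace("'", ":")  # "D:20211222141131-07:00"
    # strptime("D:%Y%m%d%H%M%S%z") -> 2021-12-22 14:11:31-07:00
    # With keep_tz=False the value is shifted to UTC and stripped,
    # yielding "2021-12-22T21:11:31".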
nv_ingest_api/util/converters/dftools.py
@@ -1,127 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- import io
- import json
-
- import fastparquet
- import pandas as pd
-
- import cudf
-
-
- class MemoryFiles:
-     def __init__(self):
-         self.output = {}
-
-     def open(self, fn, mode="rb"):
-         if mode != "wb":
-             try:
-                 self.output[fn].seek(0)
-             except KeyError:
-                 raise FileNotFoundError
-             return self.output[fn]
-
-         i = io.BytesIO()
-         self.output[fn] = i
-         self.output[fn].close = lambda: None
-
-         return i
-
-
- def pandas_to_cudf(
-     df: pd.DataFrame,
-     deserialize_cols: list = [],
-     default_cols: dict = {"document_type": str, "metadata": str},
-     default_type: type = str,
- ) -> cudf.DataFrame:
-     """
-     Helper function to convert from pandas to cudf until https://github.com/apache/arrow/pull/40412 is resolved.
-
-     Parameters
-     ----------
-     df : pd.DataFrame
-         A pandas dataframe.
-     Returns
-     -------
-     cudf.DataFrame
-         A cuDF dataframe.
-     """
-
-     if not df.empty:
-         files = MemoryFiles()
-         for col in deserialize_cols:
-             df[col] = df[col].apply(lambda x: json.loads(x))
-         df = pd.concat([df, df.iloc[0:1]], axis=0)
-
-         fastparquet.write("_", df, open_with=files.open, compression="UNCOMPRESSED", object_encoding="json")
-
-         with files.output["_"] as bytes_buf:
-             gdf = cudf.read_parquet(bytes_buf).iloc[:-1]
-             gdf.index.name = None
-
-         return gdf
-     else:
-         gdf = cudf.DataFrame({col: [] for col in default_cols})
-         for col in df.columns:
-             field_type = default_cols.get(col, default_type)
-             gdf[col] = gdf[col].astype(field_type)
-
-         return gdf
-
-
- def cudf_to_pandas(gdf: cudf.DataFrame, deserialize_cols: list = []) -> pd.DataFrame:
-     """
-     Helper function to convert from cudf to pandas until https://github.com/apache/arrow/pull/40412 is resolved.
-
-     Parameters
-     ----------
-     gdf : cudf.DataFrame
-         A cuDF dataframe.
-     nested_cols : list
-         A list of columns containing nested data.
-     Returns
-     -------
-     pd.DataFrame
-         A pandas dataframe.
-     """
-
-     with io.BytesIO() as bytes_buf:
-         gdf.to_parquet(bytes_buf)
-         df = pd.read_parquet(bytes_buf, engine="fastparquet", index=None)
-
-     for col in deserialize_cols:
-         if col in df.columns:
-             df[col] = df[col].apply(lambda x: json.loads(x))
-
-     return df
-
-
- def cudf_to_json(gdf: cudf.DataFrame, deserialize_cols: list = []) -> str:
-     """
-     Helper function to convert from cudf to json until https://github.com/apache/arrow/pull/40412 is resolved.
-
-     Parameters
-     ----------
-     gdf : cudf.DataFrame
-         A cuDF dataframe.
-     nested_cols : list
-         A list of columns containing nested data.
-     Returns
-     -------
-     str
-         A JSON formated string.
-     """
-
-     records = []
-     dict_vals = cudf_to_pandas(gdf).to_dict(orient="records")
-     for d in dict_vals:
-         temp = {}
-         for key, val in d.items():
-             if key in deserialize_cols:
-                 val = json.loads(val)
-             temp[key] = val
-         records.append(temp)
-
-     return records
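
The pandas/cuDF bridge funnels every frame through an in-memory fastparquet file; note that pandas_to_cudf appends a duplicate of the first row before writing and drops it again after reading. A hedged usage sketch (requires a GPU environment with cudf installed; the column values are illustrative):

    import pandas as pd

    df = pd.DataFrame({"document_type": ["text"], "metadata": ['{"content": "hi"}']})

    gdf = pandas_to_cudf(df, deserialize_cols=["metadata"])             # pandas -> cuDF
    round_tripped = cudf_to_pandas(gdf, deserialize_cols=["metadata"])  # and back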
nv_ingest_api/util/converters/formats.py
@@ -1,64 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- # pylint: skip-file
-
- import json
-
-
- def ingest_json_results_to_blob(result_content):
-     """
-     Parse a JSON string or BytesIO object, combine and sort entries, and create a blob string.
-
-     Returns:
-         str: The generated blob string.
-     """
-     try:
-         # Load the JSON data
-         data = json.loads(result_content) if isinstance(result_content, str) else json.loads(result_content)
-         data = data["data"]
-
-         # Smarter sorting: by page, then structured objects by x0, y0
-         def sorting_key(entry):
-             page = entry["metadata"]["content_metadata"]["page_number"]
-             if entry["document_type"] == "structured":
-                 # Use table location's x0 and y0 as secondary keys
-                 x0 = entry["metadata"]["table_metadata"]["table_location"][0]
-                 y0 = entry["metadata"]["table_metadata"]["table_location"][1]
-             else:
-                 # Non-structured objects are sorted after structured ones
-                 x0 = float("inf")
-                 y0 = float("inf")
-             return page, x0, y0
-
-         data.sort(key=sorting_key)
-
-         # Initialize the blob string
-         blob = []
-
-         for entry in data:
-             document_type = entry.get("document_type", "")
-
-             if document_type == "structured":
-                 # Add table content to the blob
-                 blob.append(entry["metadata"]["table_metadata"]["table_content"])
-                 blob.append("\n")
-
-             elif document_type == "text":
-                 # Add content to the blob
-                 blob.append(entry["metadata"]["content"])
-                 blob.append("\n")
-
-             elif document_type == "image":
-                 # Add image caption to the blob
-                 caption = entry["metadata"]["image_metadata"].get("caption", "")
-                 blob.append(f"image_caption:[{caption}]")
-                 blob.append("\n")
-
-         # Join all parts of the blob into a single string
-         return "".join(blob)
-
-     except Exception as e:
-         print(f"[ERROR] An error occurred while processing JSON content: {e}")
-         return ""
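
Note that both branches of the isinstance check call json.loads, so a BytesIO input would need to be read and decoded before this function sees it. The sort key itself orders entries by page, ranks structured entries by their table's (x0, y0), and pushes everything else to the end of each page; a toy check with mock entries (not real ingest output):

    entries = [
        {"document_type": "text",
         "metadata": {"content_metadata": {"page_number": 0}, "content": "para"}},
        {"document_type": "structured",
         "metadata": {"content_metadata": {"page_number": 0},
                      "table_metadata": {"table_location": [10, 20, 300, 400]}}},
    ]

    def key(e):
        page = e["metadata"]["content_metadata"]["page_number"]
        if e["document_type"] == "structured":
            loc = e["metadata"]["table_metadata"]["table_location"]
            return page, loc[0], loc[1]
        return page, float("inf"), float("inf")

    entries.sort(key=key)
    assert entries[0]["document_type"] == "structured"  # tables sort first on a page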
nv_ingest_api/util/converters/type_mappings.py
@@ -1,27 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
- from nv_ingest_api.internal.enums.common import ContentTypeEnum
-
- DOC_TO_CONTENT_MAP = {
-     DocumentTypeEnum.BMP: ContentTypeEnum.IMAGE,
-     DocumentTypeEnum.DOCX: ContentTypeEnum.STRUCTURED,
-     DocumentTypeEnum.HTML: ContentTypeEnum.STRUCTURED,
-     DocumentTypeEnum.JPEG: ContentTypeEnum.IMAGE,
-     DocumentTypeEnum.MP3: ContentTypeEnum.AUDIO,
-     DocumentTypeEnum.PDF: ContentTypeEnum.STRUCTURED,
-     DocumentTypeEnum.PNG: ContentTypeEnum.IMAGE,
-     DocumentTypeEnum.PPTX: ContentTypeEnum.STRUCTURED,
-     DocumentTypeEnum.SVG: ContentTypeEnum.IMAGE,
-     DocumentTypeEnum.TIFF: ContentTypeEnum.IMAGE,
-     DocumentTypeEnum.TXT: ContentTypeEnum.TEXT,
-     DocumentTypeEnum.WAV: ContentTypeEnum.AUDIO,
- }
-
-
- def doc_type_to_content_type(doc_type: DocumentTypeEnum) -> ContentTypeEnum:
-     """
-     Convert DocumentTypeEnum to ContentTypeEnum
-     """
-     return DOC_TO_CONTENT_MAP[doc_type]
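
Usage is a plain dictionary lookup; for example, with the enum members imported above:

    assert doc_type_to_content_type(DocumentTypeEnum.PDF) == ContentTypeEnum.STRUCTURED
    assert doc_type_to_content_type(DocumentTypeEnum.WAV) == ContentTypeEnum.AUDIO
    # Unmapped document types raise KeyError, since the lookup is unguarded.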
nv_ingest_api/util/detectors/__init__.py
@@ -1,5 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- # Copyright (c) 2024, NVIDIA CORPORATION.