nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,126 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Any
7
+ from typing import Dict
8
+ from typing import Optional
9
+ from typing import Tuple
10
+
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class ModelInterface:
    """
    Abstract contract for model-specific inference adapters.

    Every hook except `does_item_fit_in_batch` raises `NotImplementedError` in this base
    class; concrete interfaces override the hooks they support (input preparation,
    protocol-specific formatting, output parsing, result post-processing and,
    optionally, dynamic batching).
    """

    def format_input(self, data: dict, protocol: str, max_batch_size: int):
        """
        Convert prepared input data into the payload expected by the given protocol.

        Parameters
        ----------
        data : dict
            The input data to format.
        protocol : str
            The protocol to format the data for.
        max_batch_size : int
            Batch-size limit supplied by the caller; how it is applied is up to the subclass.

        Raises
        ------
        NotImplementedError
            Always, in the base class.
        """
        raise NotImplementedError("Subclasses should implement this method")

    def parse_output(self, response, protocol: str, data: Optional[dict] = None, **kwargs):
        """
        Turn the raw inference response into a parsed result.

        Parameters
        ----------
        response : Any
            The response from the model inference.
        protocol : str
            The protocol used ("grpc" or "http").
        data : dict, optional
            Additional input data passed to the function.
        **kwargs : Any
            Extra options; their meaning is defined by the subclass.

        Raises
        ------
        NotImplementedError
            Always, in the base class.
        """
        raise NotImplementedError("Subclasses should implement this method")

    def prepare_data_for_inference(self, data: dict):
        """
        Pre-process or transform input data before it is formatted for inference.

        Parameters
        ----------
        data : dict
            The input data to prepare.

        Raises
        ------
        NotImplementedError
            Always, in the base class.
        """
        raise NotImplementedError("Subclasses should implement this method")

    def process_inference_results(self, output_array, protocol: str, **kwargs):
        """
        Post-process the raw output produced by the model.

        Parameters
        ----------
        output_array : Any
            The raw output from the model.
        protocol : str
            The protocol the output was obtained with.
        **kwargs : Any
            Additional parameters for processing.

        Raises
        ------
        NotImplementedError
            Always, in the base class.
        """
        raise NotImplementedError("Subclasses should implement this method")

    def name(self) -> str:
        """
        Return the name of the model interface.

        Returns
        -------
        str
            The name of the model interface.

        Raises
        ------
        NotImplementedError
            Always, in the base class.
        """
        raise NotImplementedError("Subclasses should implement this method")

    def coalesce_requests_to_batch(self, requests, protocol: str, **kwargs) -> Tuple[Any, Dict[str, Any]]:
        """
        Merge several queued inference requests into one formatted batch.

        THIS METHOD IS REQUIRED FOR DYNAMIC BATCHING SUPPORT.

        Parameters
        ----------
        requests : List[InferenceRequest]
            A list of InferenceRequest namedtuples collected for the batch.
            Each tuple contains the data, dimensions, and other context for a single item.
        protocol : str
            The inference protocol, either "grpc" or "http".
        **kwargs : Any
            Additional keyword arguments passed from the original request.

        Returns
        -------
        Tuple[Any, Dict[str, Any]]
            A tuple containing the single formatted batch and its scratch-pad data.

        Raises
        ------
        NotImplementedError
            Always, in the base class; the message names the concrete class that lacks support.
        """
        interface_name = self.__class__.__name__
        raise NotImplementedError(
            f"{interface_name} does not support dynamic batching "
            "because `coalesce_requests_to_batch` is not implemented."
        )

    def does_item_fit_in_batch(self, current_batch, next_request, memory_budget_bytes: int) -> bool:
        """
        Report whether one more request can join the current batch within the memory budget.

        This is a model-specific calculation. The base implementation ignores all of its
        arguments and always answers True, i.e. the memory budget is not enforced.
        Interfaces for models that require memory management (like padded image models)
        must override this.

        Returns
        -------
        bool
            True if the item fits within the budget, False otherwise.
        """
        return True
File without changes
@@ -0,0 +1,69 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from datetime import datetime
8
+ from functools import wraps
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ # Define ANSI color codes
14
class ColorCodes:
    """Namespace of ANSI escape sequences used to colour terminal output."""

    # Bright foreground colours.
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"

    # Resets all attributes back to the terminal default.
    RESET = "\033[0m"
20
+
21
+
22
+ # Function to apply color to a message
23
def colorize(message, color_code):
    """Return `message` prefixed with `color_code` and followed by the ANSI reset sequence."""
    return "{}{}{}".format(color_code, message, ColorCodes.RESET)
25
+
26
+
27
def latency_logger(name=None):
    """
    Build a decorator that times a call and records latency timestamps on its message.

    The wrapped function must receive, as its first positional argument, an object that
    exposes `get_timestamp` (an IngestControlMessage). After the call the wrapper:

    - logs, at debug level, the time between the stored 'latency::ts_send' timestamp and
      the start of the call, when `filter_timestamp("latency::ts_send")` is truthy;
    - overwrites 'latency::ts_send' with the current time;
    - stores the call duration (a `timedelta`) under 'latency::<name>::elapsed_time'.

    Parameters
    ----------
    name : str, optional
        Custom name to use in the log message and timestamp key. Defaults to the
        function's name.

    Raises
    ------
    ValueError
        Raised by the wrapper when there is no positional argument or the first one has
        no `get_timestamp` attribute.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Reject calls whose first positional argument cannot carry timestamps.
            if not args or not hasattr(args[0], "get_timestamp"):
                raise ValueError("The first argument must be a IngestControlMessage object with metadata capabilities.")

            control_message = args[0]

            started_at = datetime.now()
            outcome = func(*args, **kwargs)
            duration = datetime.now() - started_at

            label = name or func.__name__

            # Report the hand-off latency only when a previous send timestamp exists.
            if control_message.filter_timestamp("latency::ts_send"):
                sent_at = control_message.get_timestamp("latency::ts_send")
                latency_ms = (started_at - sent_at).total_seconds() * 1e3
                logger.debug(f"{label} since ts_send: {latency_ms} msec.")

            # Restart the send clock for whatever consumes the message next.
            control_message.set_timestamp("latency::ts_send", datetime.now())
            control_message.set_timestamp(f"latency::{label}::elapsed_time", duration)
            return outcome

        return wrapper

    return decorator
@@ -0,0 +1,96 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import inspect
7
+ import uuid
8
+ from datetime import datetime
9
+ from enum import Enum
10
+
11
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
12
+
13
+
14
class TaskResultStatus(Enum):
    """Outcome of a task; each member's value is the upper-case string form of its name."""

    SUCCESS = "SUCCESS"
    FAILURE = "FAILURE"
17
+
18
+
19
def annotate_cm(control_message: IngestControlMessage, source_id=None, **kwargs):
    """
    Annotate a IngestControlMessage object with arbitrary metadata, a source ID, and a timestamp.

    Two things are written to the control message:

    - a timestamp (the current time) under ``annotation::<message>`` when a ``message``
      keyword is supplied and truthy, otherwise under ``annotation::<uuid4>``;
    - a metadata entry under a fresh ``annotation::<uuid4>`` key whose value is a dict
      holding ``source_id`` plus every keyword argument.

    Failures raised by ``set_timestamp`` / ``set_metadata`` are printed and swallowed, so
    annotation is best-effort and never interrupts the caller.

    Parameters:
    - control_message: The IngestControlMessage object to be annotated.
    - source_id: A unique identifier for the source of the annotation. If None, the
      ``__name__`` of the module two frames above this function (the caller's caller) is
      used, or "UnknownModule" when that frame or its module cannot be determined.
    - **kwargs: Arbitrary key-value pairs to be included in the annotation.

    Raises:
    - ValueError: If 'annotation_timestamp' is passed as a keyword argument.
    """
    if source_id is None:
        # Determine the __name__ of the parent caller's module by walking two frames up.
        # Stepping through f_back (instead of indexing inspect.getouterframes) tolerates
        # shallow stacks and interpreters without frame support, and avoids building
        # FrameInfo records for the entire call stack.
        frame = inspect.currentframe()
        try:
            for _ in range(2):
                if frame is None:
                    break
                frame = frame.f_back
            module = inspect.getmodule(frame) if frame is not None else None
        finally:
            # Drop the frame reference promptly to avoid reference cycles.
            del frame
        source_id = module.__name__ if module is not None else "UnknownModule"

    # 'annotation_timestamp' is reserved and may not be supplied by callers.
    if "annotation_timestamp" in kwargs:
        raise ValueError("'annotation_timestamp' is a reserved key and cannot be specified.")

    message = kwargs.get("message")
    annotation_key = f"annotation::{message}" if message else f"annotation::{uuid.uuid4()}"

    annotation_timestamp = datetime.now()
    try:
        control_message.set_timestamp(annotation_key, annotation_timestamp)
    except Exception as e:
        print(f"Failed to set annotation timestamp: {e}")

    # Construct the metadata key uniquely identified by a UUID.
    metadata_key = f"annotation::{uuid.uuid4()}"

    # The metadata value carries the source_id and any provided kwargs; the timestamp
    # itself is stored separately via set_timestamp above.
    metadata_value = {
        "source_id": source_id,
    }
    metadata_value.update(kwargs)

    try:
        # Attempt to set the annotated metadata on the IngestControlMessage object.
        control_message.set_metadata(metadata_key, metadata_value)
    except Exception as e:
        # Handle any exceptions that occur when setting metadata.
        print(f"Failed to annotate IngestControlMessage: {e}")
64
+
65
+
66
def annotate_task_result(control_message, result, task_id, source_id=None, **kwargs):
    """
    Record the outcome of a task on a IngestControlMessage object.

    The outcome is normalised to a TaskResultStatus member and then written through
    `annotate_cm` together with the task identifier and any extra key-value pairs.

    Parameters:
    - control_message: The IngestControlMessage object to be annotated.
    - result: The result of the task, either SUCCESS or FAILURE, as a TaskResultStatus
      member or a string naming one (matched case-insensitively).
    - task_id: A unique identifier for the task.
    - source_id: Optional identifier of the annotation source, forwarded to `annotate_cm`.
    - **kwargs: Arbitrary additional key-value pairs to be included in the annotation.

    Raises:
    - ValueError: If `result` is a string that names no TaskResultStatus member, or is
      neither a string nor a TaskResultStatus member.
    """
    if isinstance(result, str):
        # Look the member up by its upper-cased name.
        try:
            status = TaskResultStatus[result.upper()]
        except KeyError:
            valid_names = [member.name for member in TaskResultStatus]
            raise ValueError(f"Invalid result string: {result}. Must be one of {valid_names}.")
    elif isinstance(result, TaskResultStatus):
        status = result
    else:
        raise ValueError("result must be an instance of TaskResultStatus Enum or a valid result string.")

    # Call annotate_cm directly: when source_id is None it inspects the call stack, so no
    # extra frame may sit between this function and annotate_cm.
    annotate_cm(
        control_message,
        source_id=source_id,
        task_result=status.value,
        task_id=task_id,
        **kwargs,
    )
@@ -0,0 +1,288 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import functools
7
+ import inspect
8
+ import logging
9
+ import string
10
+ from datetime import datetime
11
+ from typing import Optional
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def traceable(trace_name: Optional[str] = None):
    """
    A decorator that adds entry and exit trace timestamps to a IngestControlMessage
    based on the presence of a 'config::add_trace_tagging' flag.

    This decorator checks if the 'config::add_trace_tagging' flag is set to True in the
    message's metadata. If so, it records the entry and exit timestamps of the function
    execution, using either a provided custom trace name, auto-detected stage name from
    self.stage_name, or the function's name as fallback.

    Parameters
    ----------
    trace_name : str, optional
        A custom name for the trace entries. If not provided, attempts to use
        self.stage_name from the decorated method's instance, falling back to the
        function's name if neither is available.

    Returns
    -------
    decorator_trace_tagging : Callable
        A wrapper function that decorates the target function to implement trace tagging.

    Raises
    ------
    ValueError
        Raised by the wrapper when neither of the first two positional arguments exposes
        `has_metadata` (including when the function is called without positional arguments).

    Notes
    -----
    The decorated function must accept a IngestControlMessage object as one of its arguments.
    For a regular function, this is expected to be the first argument; for a class method,
    this is expected to be the second argument (after 'self'). The message must be passed
    positionally. The IngestControlMessage object must implement `has_metadata`,
    `get_metadata`, `get_timestamp`, and `set_timestamp`, which the decorator uses to check
    for the trace tagging flag and to record timestamps.

    When tagging is enabled the following timestamps are set on the message:
        - 'trace::entry::<trace_name>': The timestamp marking the function's entry.
        - 'trace::exit::<trace_name>': The timestamp marking the function's exit.
        - 'trace::entry::<trace_name>_channel_in' / 'trace::exit::<trace_name>_channel_in':
          the previous 'latency::ts_send' value and the time the wrapper was entered; only
          written when 'latency::ts_send' is already present.
        - 'latency::ts_send': overwritten with the exit timestamp.

    The exit timestamps are only written when the decorated function returns normally.

    Examples
    --------
    Automatic stage name detection (recommended):

    >>> @traceable()  # Uses self.stage_name automatically
    ... def process_message(self, message):
    ...     pass

    Explicit trace name (override):

    >>> @traceable("custom_trace")
    ... def process_message(self, message):
    ...     pass

    Function without instance (uses function name):

    >>> @traceable()
    ... def process_message(message):
    ...     pass
    """

    def decorator_trace_tagging(func):
        @functools.wraps(func)
        def wrapper_trace_tagging(*args, **kwargs):
            # Captured first so it reflects when the message was fetched by this stage.
            ts_fetched = datetime.now()

            # Determine the trace name to use: explicit name, then self.stage_name,
            # then the function's own name.
            resolved_trace_name = trace_name
            if resolved_trace_name is None:
                stage_name = getattr(args[0], "stage_name", None) if args else None
                if stage_name:
                    resolved_trace_name = stage_name
                    logger.debug("Using auto-detected trace name: '%s'", resolved_trace_name)
                else:
                    resolved_trace_name = func.__name__
                    logger.debug("Using function name as trace name: '%s'", resolved_trace_name)

            # Determine which argument is the message: the first positional argument for
            # plain functions, the second (after 'self') for methods. Slicing keeps this
            # safe when fewer than two positional arguments were supplied.
            message = next((arg for arg in args[:2] if hasattr(arg, "has_metadata")), None)
            if message is None:
                raise ValueError("traceable decorator could not find a message argument with 'has_metadata()'")

            do_trace_tagging = (message.has_metadata("config::add_trace_tagging") is True) and (
                message.get_metadata("config::add_trace_tagging") is True
            )

            trace_prefix = resolved_trace_name

            if do_trace_tagging:
                ts_send = message.get_timestamp("latency::ts_send")
                ts_entry = datetime.now()
                message.set_timestamp(f"trace::entry::{trace_prefix}", ts_entry)
                if ts_send:
                    # Time spent in the inbound channel: from the previous send to the fetch.
                    message.set_timestamp(f"trace::entry::{trace_prefix}_channel_in", ts_send)
                    message.set_timestamp(f"trace::exit::{trace_prefix}_channel_in", ts_fetched)

            # Call the decorated function.
            result = func(*args, **kwargs)

            if do_trace_tagging:
                ts_exit = datetime.now()
                message.set_timestamp(f"trace::exit::{trace_prefix}", ts_exit)
                # Restart the send clock for the next stage's channel-in measurement.
                message.set_timestamp("latency::ts_send", ts_exit)

            return result

        return wrapper_trace_tagging

    return decorator_trace_tagging
127
+
128
+
129
def traceable_func(trace_name=None, dedupe=True):
    """
    A decorator that injects trace information for tracking the execution of a function.
    It logs the entry and exit timestamps of the function in a `trace_info` dictionary,
    which can be used for performance monitoring or debugging purposes.

    Parameters
    ----------
    trace_name : str, optional
        An optional string used as the prefix for the trace log entries. If not provided,
        the decorated function's name is used. The string can include placeholders (e.g.,
        "pdf_extractor::{model_name}") that will be dynamically replaced with matching
        function argument values.
    dedupe : bool, optional
        If True, ensures that the trace entry and exit keys are unique by appending an index
        (e.g., `_0`, `_1`) to the keys if duplicate entries are detected. Default is True.

    Returns
    -------
    function
        A wrapped function that injects trace information before and after the function's
        execution.

    Notes
    -----
    - `trace_info` is removed from the keyword arguments before the decorated function is
      called. If it is not provided (or is None), a new dictionary is created, used for
      the trace entries and then discarded.
    - If `trace_name` contains format placeholders, the decorator attempts to populate them
      with matching argument values from the decorated function (positional arguments
      first, then keyword arguments). A placeholder with no matching argument is replaced
      by its own name; parameter defaults are not consulted.
    - The trace information is logged in the format:
        - `trace::entry::{trace_name}` for the entry timestamp.
        - `trace::exit::{trace_name}` for the exit timestamp.
    - If `dedupe` is True, the trace keys will be appended with an index to avoid
      overwriting existing entries.
    - The exit timestamp is only recorded when the decorated function returns normally.

    Example
    -------
    >>> @traceable_func(trace_name="pdf_extractor::{model_name}")
    ... def extract_pdf(model_name):
    ...     pass
    >>> trace_info = {}
    >>> extract_pdf("my_model", trace_info=trace_info)

    In this example, `model_name` is dynamically replaced in the trace_name, and the
    trace information is logged with unique keys if deduplication is enabled.
    """

    def decorator_inject_trace_info(func):
        # Template for the trace prefix; fall back to the function name when none is given.
        prefix_template = trace_name if trace_name else func.__name__

        # Parse the template that is actually used. Parsing `trace_name` directly would
        # raise TypeError for the default `trace_name=None`. This is call-invariant, so it
        # is done once at decoration time.
        placeholders = [
            field_name for _, field_name, _, _ in string.Formatter().parse(prefix_template) if field_name is not None
        ]
        # Parameter names are only needed to resolve placeholders from positional arguments.
        arg_names = list(inspect.signature(func).parameters) if placeholders else []

        @functools.wraps(func)
        def wrapper_inject_trace_info(*args, **kwargs):
            trace_info = kwargs.pop("trace_info", None)
            if trace_info is None:
                trace_info = {}

            trace_prefix = prefix_template

            # If the template is a formattable string, e.g., "pdf_extractor::{model_name}",
            # search `args` and `kwargs` to replace the placeholders.
            if placeholders:
                args_name_to_val = dict(zip(arg_names, args))
                format_kwargs = {}
                for name in placeholders:
                    if name in args_name_to_val:
                        format_kwargs[name] = args_name_to_val[name]
                    elif name in kwargs:
                        format_kwargs[name] = kwargs[name]
                    else:
                        # Unresolved placeholders fall back to their own name.
                        format_kwargs[name] = name
                trace_prefix = prefix_template.format(**format_kwargs)

            trace_entry_key = f"trace::entry::{trace_prefix}"
            trace_exit_key = f"trace::exit::{trace_prefix}"

            ts_entry = datetime.now()

            if dedupe:
                # Find the first index for which neither key is already taken. The suffix
                # is appended directly (not via str.format) so braces that arrived through
                # argument values cannot be misread as format fields.
                i = 0
                while f"{trace_entry_key}_{i}" in trace_info or f"{trace_exit_key}_{i}" in trace_info:
                    i += 1
                trace_entry_key = f"{trace_entry_key}_{i}"
                trace_exit_key = f"{trace_exit_key}_{i}"

            trace_info[trace_entry_key] = ts_entry

            # Call the decorated function
            result = func(*args, **kwargs)

            trace_info[trace_exit_key] = datetime.now()

            return result

        return wrapper_inject_trace_info

    return decorator_inject_trace_info
230
+
231
+
232
def set_trace_timestamps_with_parent_context(control_message, execution_trace_log: dict, parent_name: str, logger=None):
    """
    Copy trace timestamps onto a control message, namespacing child traces under a parent.

    Every entry of `execution_trace_log` is written with `control_message.set_timestamp`.
    Keys of the form ``trace::<type>::<child>`` whose ``<child>`` part does not already
    begin with ``<parent_name>::`` are rewritten to
    ``trace::<type>::<parent_name>::<child>``; all other keys are written unchanged.
    This resolves OpenTelemetry span hierarchy issues where child spans cannot
    find their expected parent contexts.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message to set timestamps on
    execution_trace_log : dict
        Dictionary of trace keys to timestamp values from internal operations.
        Nothing happens when it is empty or None.
    parent_name : str
        The parent stage name to use as context for child traces
    logger : logging.Logger, optional
        Logger for debug output of key transformations

    Examples
    --------
    Basic usage in a stage:

    >>> execution_trace_log = {"trace::entry::yolox_inference": ts1, "trace::exit::yolox_inference": ts2}
    >>> set_trace_timestamps_with_parent_context(
    ...     control_message, execution_trace_log, "pdf_extractor", logger
    ... )

    This transforms:
    - trace::entry::yolox_inference -> trace::entry::pdf_extractor::yolox_inference
    - trace::exit::yolox_inference -> trace::exit::pdf_extractor::yolox_inference
    """
    if not execution_trace_log:
        return

    for key, ts in execution_trace_log.items():
        # Split off at most the "trace" marker and the trace type; the remainder is the
        # child name, which may itself contain "::".
        segments = key.split("::", 2) if key.startswith("trace::") else []

        if len(segments) < 3:
            # Not a fully qualified trace key: store it unchanged.
            control_message.set_timestamp(key, ts)
            continue

        trace_type, child_name = segments[1], segments[2]

        if child_name.startswith(f"{parent_name}::"):
            # Already carries the parent context.
            control_message.set_timestamp(key, ts)
            continue

        enhanced_key = f"trace::{trace_type}::{parent_name}::{child_name}"
        if logger:
            logger.debug(f"Enhanced trace key: {key} -> {enhanced_key}")
        control_message.set_timestamp(enhanced_key, ts)
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0