nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,137 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Optional
7
+ from typing import Tuple
8
+
9
+ from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
10
+
11
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class OCRExtractorConfigSchema(LowercaseProtocolMixin):
    """
    Configuration schema for OCR (text extraction) service endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Hidden from ``repr``.

    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the ocr endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    ocr_infer_protocol : str, default=""
        Inference protocol. When left empty it is inferred from ``ocr_endpoints``
        ("http" if an HTTP service is configured, otherwise "grpc").

    nim_batch_size : int, default=2
        Batch size setting (consumed outside this module).

    workers_per_progress_engine : int, default=5
        Worker count setting (consumed outside this module).

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    ocr_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for all endpoints.

        Ensures that at least one service (either gRPC or HTTP) is provided
        for each endpoint in the configuration, normalizes blank services to
        ``None`` and fills in the inference protocol when it was not given.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        def clean_service(service):
            """Set service to None if it is empty or contains only whitespace and quotes."""
            if service is None or not service.strip():
                return None
            # Strip whitespace and quotes together so values such as "''\n"
            # (an empty quoted string with a trailing newline) count as blank.
            if not service.strip(" \t\n\r\f\v\"'"):
                return None
            return service

        for endpoint_name in ["ocr_endpoints"]:
            # ``or`` covers both a missing key and an explicit ``None``; unpacking
            # ``None`` would raise a TypeError instead of the ValueError below.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            # Auto-infer protocol from endpoints if not specified
            protocol_name = endpoint_name.replace("_endpoints", "_infer_protocol")
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # HTTP wins when both services are configured.
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
104
+
105
+
106
class OCRExtractorSchema(BaseModel):
    """
    Configuration schema for text extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.
        Must be greater than 0.

    n_workers : int, default=2
        The number of worker threads to use for processing.
        Must be greater than 0.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during text extraction.

    endpoint_config : Optional[OCRExtractorConfigSchema], default=None
        Configuration for the text extraction stage, including the ocr service endpoints.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[OCRExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """Reject zero or negative values for the validated fields."""
        if v <= 0:
            # ``field`` is the validation info object; ``field_name`` names the offending field.
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v

    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,220 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ from pydantic import model_validator, ConfigDict, BaseModel, Field
11
+
12
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class PDFiumConfigSchema(LowercaseProtocolMixin):
    """
    Configuration schema for PDFium endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Hidden from ``repr``.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol. When left empty it is inferred from ``yolox_endpoints``
        ("http" if an HTTP service is configured, otherwise "grpc").

    nim_batch_size : int, default=4
        Batch size setting (consumed outside this module).

    workers_per_progress_engine : int, default=5
        Worker count setting (consumed outside this module).

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    nim_batch_size: int = 4
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for all endpoints.

        Blank services are normalized to ``None`` and the inference protocol
        is filled in from the endpoints when it was not given.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # ``or`` covers both a missing key and an explicit ``None``; unpacking
            # ``None`` would raise a TypeError instead of the ValueError below.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = _clean_service(grpc_service)
            http_service = _clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            # Auto-infer protocol from endpoints if not specified
            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # HTTP wins when both services are configured.
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
97
+
98
+
99
class NemotronParseConfigSchema(LowercaseProtocolMixin):
    """
    Configuration schema for Nemotron Parse endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Hidden from ``repr``.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Not checked by ``validate_endpoints``; both entries may be empty.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. Not auto-inferred by
        ``validate_endpoints``.

    nemotron_parse_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the nemotron_parse endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    nemotron_parse_infer_protocol : str, default=""
        Inference protocol. When left empty it is inferred from
        ``nemotron_parse_endpoints`` ("http" if an HTTP service is configured,
        otherwise "grpc").

    nemotron_parse_model_name : str, default="nvidia/nemotron-parse"
        Name of the Nemotron Parse model.

    timeout : float, default=300.0
        Timeout value (presumably seconds; confirm against the consumer).

    workers_per_progress_engine : int, default=5
        Worker count setting (consumed outside this module).

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided
        for the nemotron_parse endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for the nemotron_parse endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    nemotron_parse_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    nemotron_parse_infer_protocol: str = ""

    nemotron_parse_model_name: str = "nvidia/nemotron-parse"

    timeout: float = 300.0

    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for the nemotron_parse endpoint.

        Blank services are normalized to ``None`` and the inference protocol
        is filled in from the endpoints when it was not given.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for the endpoint.
        """

        for model_name in ["nemotron_parse"]:
            endpoint_name = f"{model_name}_endpoints"
            # ``or`` covers both a missing key and an explicit ``None``; unpacking
            # ``None`` would raise a TypeError instead of the ValueError below.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = _clean_service(grpc_service)
            http_service = _clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            # Auto-infer protocol from endpoints if not specified
            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # HTTP wins when both services are configured.
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
185
+
186
+
187
class PDFExtractorSchema(BaseModel):
    """
    Top-level settings for the PDF extraction stage.

    Parameters
    ----------
    max_queue_size : int, default=1
        Upper bound on the number of items held in the processing queue.

    n_workers : int, default=16
        How many worker threads process items.

    raise_on_failure : bool, default=False
        Whether a processing failure should raise an exception.

    pdfium_config : Optional[PDFiumConfigSchema], default=None
        Endpoint configuration for the PDFium engine.

    nemotron_parse_config : Optional[NemotronParseConfigSchema], default=None
        Endpoint configuration for the Nemotron Parse engine.

    Config
    ------
    extra : str
        Unknown fields are rejected ("forbid").
    """

    # Reject unknown keys instead of silently ignoring them.
    model_config = ConfigDict(extra="forbid")

    # Generic stage settings.
    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    # Per-engine endpoint configuration; each is optional.
    pdfium_config: Optional[PDFiumConfigSchema] = None
    nemotron_parse_config: Optional[NemotronParseConfigSchema] = None
214
+
215
+
216
+ def _clean_service(service):
217
+ """Set service to None if it's an empty string or contains only spaces or quotes."""
218
+ if service is None or not service.strip() or service.strip(" \"'") == "":
219
+ return None
220
+ return service
@@ -0,0 +1,128 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ from pydantic import model_validator, ConfigDict, BaseModel, Field
11
+
12
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
13
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class PPTXConfigSchema(LowercaseProtocolMixin):
    """
    Configuration schema for pptx extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Hidden from ``repr``.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol. When left empty it is inferred from ``yolox_endpoints``
        ("http" if an HTTP service is configured, otherwise "grpc").

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for all endpoints.

        Blank services are normalized to ``None`` and the inference protocol
        is filled in from the endpoints when it was not given.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        def clean_service(service):
            """Set service to None if it is empty or contains only whitespace and quotes."""
            if service is None or not service.strip():
                return None
            # Strip whitespace and quotes together so values such as "''\n"
            # (an empty quoted string with a trailing newline) count as blank.
            if not service.strip(" \t\n\r\f\v\"'"):
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # Fall back to an empty pair when the key is missing or ``None``;
            # unpacking ``None`` would raise a TypeError instead of the
            # ValueError below.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            # Auto-infer protocol from endpoints if not specified
            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # HTTP wins when both services are configured.
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
101
+
102
+
103
class PPTXExtractorSchema(BaseModel):
    """
    Configuration schema for the PPTX extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of worker threads to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    pptx_extraction_config : Optional[PPTXConfigSchema], default=None
        Configuration schema for the pptx extraction stage endpoints.

    pdfium_config : Optional[PDFiumConfigSchema], default=None
        Configuration for the PDFium service endpoints.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    pptx_extraction_config: Optional[PPTXConfigSchema] = None
    pdfium_config: Optional[PDFiumConfigSchema] = None
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,137 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
11
+
12
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class TableExtractorConfigSchema(LowercaseProtocolMixin):
    """
    Configuration schema for the table extraction stage settings.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Hidden from ``repr``.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for yolox. When left empty it is inferred from
        ``yolox_endpoints``.

    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the ocr endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    ocr_infer_protocol : str, default=""
        Inference protocol for ocr. When left empty it is inferred from
        ``ocr_endpoints``.

    nim_batch_size : int, default=2
        Batch size setting (consumed outside this module).

    workers_per_progress_engine : int, default=5
        Worker count setting (consumed outside this module).

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided
        for both the yolox and the ocr endpoints.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for the yolox or the ocr endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    ocr_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for the yolox and ocr endpoints.

        Blank services are normalized to ``None`` and each inference protocol
        is filled in from its endpoints when it was not given.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for the yolox or the ocr endpoint.
        """

        def clean_service(service):
            """Set service to None if it is empty or contains only whitespace and quotes."""
            if service is None or not service.strip():
                return None
            # Strip whitespace and quotes together so values such as "''\n"
            # (an empty quoted string with a trailing newline) count as blank.
            if not service.strip(" \t\n\r\f\v\"'"):
                return None
            return service

        for endpoint_name in ["yolox_endpoints", "ocr_endpoints"]:
            # ``or`` covers both a missing key and an explicit ``None``; unpacking
            # ``None`` would raise a TypeError instead of the ValueError below.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            # Auto-infer protocol from endpoints if not specified
            protocol_name = endpoint_name.replace("_endpoints", "_infer_protocol")
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # HTTP wins when both services are configured.
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
105
+
106
+
107
class TableExtractorSchema(BaseModel):
    """
    Configuration schema for the table extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.
        Must be greater than 0.

    n_workers : int, default=2
        The number of worker threads to use for processing.
        Must be greater than 0.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during table extraction.

    endpoint_config : Optional[TableExtractorConfigSchema], default=None
        Configuration for the table extraction stage, including yolox and ocr service endpoints.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[TableExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """Reject zero or negative values for the validated fields."""
        if v <= 0:
            # ``field`` is the validation info object; ``field_name`` names the offending field.
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v

    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,37 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from pydantic import BaseModel, Field
6
+ from typing import Optional, Literal, Annotated
7
+
8
+
9
class MessageBrokerClientSchema(BaseModel):
    """
    Connection settings for a message broker client.

    Supports Redis or simple in-memory clients.

    Parameters
    ----------
    host : str, default="redis"
        Hostname of the broker service.

    port : int, default=6379
        Port to connect to; constrained to 1..65535.

    client_type : Literal["redis", "simple"], default="redis"
        Which broker client implementation to use.

    broker_params : Optional[dict], default={}
        Extra parameters handed to the broker client.

    connection_timeout : int, default=300
        Connection timeout in seconds; must be >= 0.

    max_backoff : int, default=300
        Maximum backoff time in seconds; must be >= 0.

    max_retries : int, default=0
        Maximum number of retries; must be >= 0.
    """

    host: str = Field(
        default="redis",
        description="Hostname of the broker service.",
    )

    # Range constraint lives in the annotation; default and description in the assignment.
    port: Annotated[int, Field(gt=0, lt=65536)] = Field(
        default=6379,
        description="Port to connect to. Must be between 1 and 65535.",
    )

    client_type: Literal["redis", "simple"] = Field(
        default="redis",
        description="Type of broker client. Supported values: 'redis', 'simple'.",
    )

    # default_factory gives every instance its own dict.
    broker_params: Optional[dict] = Field(
        default_factory=dict,
        description="Optional parameters passed to the broker client.",
    )

    connection_timeout: Annotated[int, Field(ge=0)] = Field(
        default=300,
        description="Connection timeout in seconds. Must be >= 0.",
    )

    max_backoff: Annotated[int, Field(ge=0)] = Field(
        default=300,
        description="Maximum backoff time in seconds. Must be >= 0.",
    )

    max_retries: Annotated[int, Field(ge=0)] = Field(
        default=0,
        description="Maximum number of retries. Must be >= 0.",
    )
@@ -0,0 +1,34 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+
9
+ from pydantic import ConfigDict, BaseModel
10
+ from pydantic import Field
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ # Define schemas for request validation
16
class PushRequestSchema(BaseModel):
    """
    Request payload for pushing a message onto a queue.

    Parameters
    ----------
    command : str
        Command name carried by the request.

    queue_name : str
        Target queue; must be a non-empty string.

    message : str
        Message body; must be a non-empty string.

    timeout : Optional[float], default=100
        Optional timeout for a blocking push.
    """

    # Unknown keys in the request are rejected.
    model_config = ConfigDict(extra="forbid")

    command: str
    queue_name: str = Field(..., min_length=1)
    message: str = Field(..., min_length=1)
    timeout: Optional[float] = 100
22
+
23
+
24
class PopRequestSchema(BaseModel):
    """
    Request payload for popping a message from a queue.

    Parameters
    ----------
    command : str
        Command name carried by the request.

    queue_name : str
        Source queue; must be a non-empty string.

    timeout : Optional[float], default=100
        Optional timeout for a blocking pop.
    """

    # Unknown keys in the request are rejected.
    model_config = ConfigDict(extra="forbid")

    command: str
    queue_name: str = Field(..., min_length=1)
    timeout: Optional[float] = 100
29
+
30
+
31
class SizeRequestSchema(BaseModel):
    """
    Request payload for querying the size of a queue.

    Parameters
    ----------
    command : str
        Command name carried by the request.

    queue_name : str
        Queue to inspect; must be a non-empty string.
    """

    # Unknown keys in the request are rejected.
    model_config = ConfigDict(extra="forbid")

    command: str
    queue_name: str = Field(..., min_length=1)