nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,133 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ from pydantic import BaseModel, Field
11
+ from pydantic import root_validator
12
+
13
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class AudioConfigSchema(LowercaseProtocolMixin):
    """
    Configuration schema for audio extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Excluded from ``repr``.

    audio_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services (in that order) for the audio endpoint.
        Either the gRPC or HTTP service can be empty, but not both; validation fails
        when neither is provided, so the declared default alone is not a valid value.

    audio_infer_protocol : Optional[str], default=None
        Inference protocol. When empty it is inferred from the endpoints: "http" if an
        HTTP service is configured, otherwise "grpc".

    function_id : Optional[str], default=None
        Optional identifier; its semantics are defined by the consumer of this schema.

    use_ssl : Optional[bool], default=None
        Optional SSL toggle; its semantics are defined by the consumer of this schema.

    ssl_cert : Optional[str], default=None
        Optional SSL certificate value. Excluded from ``repr``.

    segment_audio : Optional[bool], default=None
        Optional segmentation toggle; its semantics are defined by the consumer of this schema.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)
    audio_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    audio_infer_protocol: Optional[str] = None
    function_id: Optional[str] = None
    use_ssl: Optional[bool] = None
    ssl_cert: Optional[str] = Field(default=None, repr=False)
    segment_audio: Optional[bool] = None

    @root_validator(pre=True)
    def validate_endpoints(cls, values):
        """
        Normalize the audio endpoints and infer the protocol when it is not given.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.

        Returns
        -------
        dict
            The same dictionary, with ``audio_endpoints`` cleaned and
            ``audio_infer_protocol`` filled in.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty or missing.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        endpoint_name = "audio_endpoints"
        # A missing or explicit-None entry is treated as "nothing configured" so the
        # caller gets the ValueError below rather than a TypeError from tuple unpacking.
        grpc_service, http_service = values.get(endpoint_name) or (None, None)
        grpc_service = clean_service(grpc_service)
        http_service = clean_service(http_service)

        if not grpc_service and not http_service:
            raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

        values[endpoint_name] = (grpc_service, http_service)

        # Auto-infer protocol from endpoints if not specified (HTTP wins when both are set)
        protocol_name = "audio_infer_protocol"
        protocol_value = values.get(protocol_name)

        if not protocol_value:
            protocol_value = "http" if http_service else "grpc" if grpc_service else ""

        values[protocol_name] = protocol_value

        return values

    class Config:
        extra = "forbid"
105
+
106
+
107
class AudioExtractorSchema(BaseModel):
    """
    Configuration schema for the audio extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    audio_extraction_config : Optional[AudioConfigSchema], default=None
        Configuration schema for the audio extraction stage.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    audio_extraction_config: Optional[AudioConfigSchema] = None

    class Config:
        extra = "forbid"
@@ -0,0 +1,144 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Optional
7
+ from typing import Tuple
8
+
9
+ from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
10
+
11
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class ChartExtractorConfigSchema(LowercaseProtocolMixin):
    """
    Endpoint and option settings for the chart extraction services.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Excluded from ``repr``.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        (gRPC, HTTP) services for the yolox endpoint. At least one must be set.

    yolox_infer_protocol : str, default=""
        Protocol for yolox; inferred from the endpoints when left empty.

    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        (gRPC, HTTP) services for the ocr endpoint. At least one must be set.

    ocr_infer_protocol : str, default=""
        Protocol for ocr; inferred from the endpoints when left empty.

    nim_batch_size : int, default=2
        Batch size setting; its semantics are defined by the consumer of this schema.

    workers_per_progress_engine : int, default=5
        Worker count setting; its semantics are defined by the consumer of this schema.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    ocr_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Clean every endpoint pair and fill in a missing protocol.

        Parameters
        ----------
        values : dict
            Raw input values for the model; updated in place.

        Returns
        -------
        dict
            The same dictionary with cleaned endpoint tuples and protocols set.

        Raises
        ------
        ValueError
            If an endpoint pair has neither a gRPC nor an HTTP service.
        """

        def _normalize(raw):
            """Map None, blank, or quote-only strings to None; pass anything else through."""
            if raw is None:
                return None
            if not raw.strip():
                return None
            if raw.strip(" \"'") == "":
                return None
            return raw

        for endpoint_name in ("yolox_endpoints", "ocr_endpoints"):
            raw_grpc, raw_http = values.get(endpoint_name, (None, None))
            grpc_url = _normalize(raw_grpc)
            http_url = _normalize(raw_http)

            if not (grpc_url or http_url):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_url, http_url)

            # Only infer when the caller left the protocol empty. At least one
            # service is set at this point, so the fallback is always "grpc".
            protocol_key = endpoint_name.replace("_endpoints", "_infer_protocol")
            if not values.get(protocol_key):
                values[protocol_key] = "http" if http_url else "grpc"

        return values

    model_config = ConfigDict(extra="forbid")
111
+
112
+
113
class ChartExtractorSchema(BaseModel):
    """
    Configuration schema for chart extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue. Must be greater than 0.

    n_workers : int, default=2
        The number of workers to use for processing. Must be greater than 0.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during chart extraction.

    endpoint_config : Optional[ChartExtractorConfigSchema], default=None
        Configuration for the chart extraction stage, including yolox and ocr service endpoints.

    Raises
    ------
    ValueError
        If ``max_queue_size`` or ``n_workers`` is not greater than 0.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[ChartExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """
        Reject zero or negative values for the sizing fields.

        ``field`` is the validation info object pydantic passes to field validators;
        its ``field_name`` identifies which field is being checked.
        """
        if v <= 0:
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v

    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,129 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ from pydantic import model_validator, ConfigDict, BaseModel, Field
11
+
12
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
13
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class DocxConfigSchema(LowercaseProtocolMixin):
    """
    Configuration schema for docx extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Excluded from ``repr``.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services (in that order) for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both; validation fails
        when neither is provided, so the declared default alone is not a valid value.

    yolox_infer_protocol : str, default=""
        Inference protocol. When empty it is inferred from the endpoints: "http" if an
        HTTP service is configured, otherwise "grpc".

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalize the endpoint tuples and infer protocols that were not given.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.

        Returns
        -------
        dict
            The same dictionary, with endpoints cleaned and protocols filled in.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty or missing for any endpoint.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # A missing or explicit-None entry is treated as "nothing configured" so the
            # caller gets the ValueError below rather than a TypeError from tuple unpacking.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            # Auto-infer protocol from endpoints if not specified (HTTP wins when both are set)
            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
101
+
102
+
103
class DocxExtractorSchema(BaseModel):
    """
    Configuration schema for the docx extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    docx_extraction_config : Optional[DocxConfigSchema], default=None
        Configuration schema for the docx extraction stage.

    pdfium_config : Optional[PDFiumConfigSchema], default=None
        Optional PDFium configuration (schema imported from the PDF extraction schemas).

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    docx_extraction_config: Optional[DocxConfigSchema] = None
    pdfium_config: Optional[PDFiumConfigSchema] = None

    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,34 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ConfigDict, BaseModel
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class HtmlExtractorSchema(BaseModel):
    """
    Configuration schema for the HTML extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,126 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ from pydantic import model_validator, ConfigDict, BaseModel, Field
11
+
12
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class ImageConfigSchema(LowercaseProtocolMixin):
    """
    Configuration schema for image extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Excluded from ``repr``.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services (in that order) for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both; validation fails
        when neither is provided, so the declared default alone is not a valid value.

    yolox_infer_protocol : str, default=""
        Inference protocol. When empty it is inferred from the endpoints: "http" if an
        HTTP service is configured, otherwise "grpc".

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalize the endpoint tuples and infer protocols that were not given.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.

        Returns
        -------
        dict
            The same dictionary, with endpoints cleaned and protocols filled in.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty or missing for any endpoint.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # A missing or explicit-None entry is treated as "nothing configured" so the
            # caller gets the ValueError below rather than a TypeError from tuple unpacking.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            # Auto-infer protocol from endpoints if not specified (HTTP wins when both are set)
            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
100
+
101
+
102
class ImageExtractorSchema(BaseModel):
    """
    Configuration schema for the image extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    image_extraction_config : Optional[ImageConfigSchema], default=None
        Configuration schema for the image extraction stage.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    image_extraction_config: Optional[ImageConfigSchema] = None
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,137 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Optional
7
+ from typing import Tuple
8
+
9
+ from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
10
+
11
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class InfographicExtractorConfigSchema(LowercaseProtocolMixin):
    """
    Endpoint and option settings for the infographic extraction services.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Excluded from ``repr``.

    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        (gRPC, HTTP) services for the ocr endpoint. At least one must be set.

    ocr_infer_protocol : str, default=""
        Protocol for ocr; inferred from the endpoints when left empty.

    nim_batch_size : int, default=2
        Batch size setting; its semantics are defined by the consumer of this schema.

    workers_per_progress_engine : int, default=5
        Worker count setting; its semantics are defined by the consumer of this schema.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    ocr_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Clean every endpoint pair and fill in a missing protocol.

        Parameters
        ----------
        values : dict
            Raw input values for the model; updated in place.

        Returns
        -------
        dict
            The same dictionary with cleaned endpoint tuples and protocols set.

        Raises
        ------
        ValueError
            If an endpoint pair has neither a gRPC nor an HTTP service.
        """

        def _normalize(raw):
            """Map None, blank, or quote-only strings to None; pass anything else through."""
            if raw is None:
                return None
            if not raw.strip():
                return None
            if raw.strip(" \"'") == "":
                return None
            return raw

        for endpoint_name in ("ocr_endpoints",):
            raw_grpc, raw_http = values.get(endpoint_name, (None, None))
            grpc_url = _normalize(raw_grpc)
            http_url = _normalize(raw_http)

            if not (grpc_url or http_url):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_url, http_url)

            # Only infer when the caller left the protocol empty. At least one
            # service is set at this point, so the fallback is always "grpc".
            protocol_key = endpoint_name.replace("_endpoints", "_infer_protocol")
            if not values.get(protocol_key):
                values[protocol_key] = "http" if http_url else "grpc"

        return values

    model_config = ConfigDict(extra="forbid")
104
+
105
+
106
class InfographicExtractorSchema(BaseModel):
    """
    Configuration schema for infographic extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue. Must be greater than 0.

    n_workers : int, default=2
        The number of workers to use for processing. Must be greater than 0.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during infographic extraction.

    endpoint_config : Optional[InfographicExtractorConfigSchema], default=None
        Configuration for the infographic extraction stage, including the ocr service endpoints.

    Raises
    ------
    ValueError
        If ``max_queue_size`` or ``n_workers`` is not greater than 0.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[InfographicExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """
        Reject zero or negative values for the sizing fields.

        ``field`` is the validation info object pydantic passes to field validators;
        its ``field_name`` identifies which field is being checked.
        """
        if v <= 0:
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v

    model_config = ConfigDict(extra="forbid")