nv-ingest-api 2025.4.15.dev20250415__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +72 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.15.dev20250415.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,124 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ from pydantic import model_validator, ConfigDict, BaseModel
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class ImageConfigSchema(BaseModel):
    """
    Configuration schema for image extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. When empty it is derived from the
        configured services ("http" if an HTTP service is set, otherwise "grpc").
        The stored value is always lower-cased.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalizes and validates the gRPC and HTTP services for all endpoints.

        Blank services (empty, whitespace-only, or made up only of spaces and quote
        characters) are replaced with None, and the infer protocol is filled in from
        the available services when it was not supplied.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty (or missing) for any endpoint.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # Fall back to an empty pair when the key is missing (or explicitly None) so that the
            # "cannot be empty" ValueError below is raised rather than a TypeError from unpacking None.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # No explicit protocol: prefer HTTP when an HTTP service is configured.
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            protocol_value = protocol_value.lower()
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
98
+
99
+
100
class ImageExtractorSchema(BaseModel):
    """
    Configuration schema for the image extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    image_extraction_config : Optional[ImageConfigSchema], default=None
        Configuration schema for the image extraction stage.

    Config
    ------
    extra : str
        Set to "forbid", so fields not declared on this schema are rejected.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    image_extraction_config: Optional[ImageConfigSchema] = None
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,128 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Optional
7
+ from typing import Tuple
8
+
9
+ from pydantic import field_validator, model_validator, ConfigDict, BaseModel
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class InfographicExtractorConfigSchema(BaseModel):
    """
    Configuration schema for infographic extraction service endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the paddle endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    paddle_infer_protocol : str, default=""
        Inference protocol for the paddle endpoint. The validator in this class
        does not derive or modify it.

    nim_batch_size : int, default=2
        Batch size setting (presumably for NIM inference requests; confirm against the consumer).

    workers_per_progress_engine : int, default=5
        Worker count setting (exact semantics are defined by the consumer of this schema).

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    paddle_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalizes and validates the gRPC and HTTP services for all endpoints.

        Each service is reduced to None when it is blank, and the cleaned pair is
        written back into ``values``. At least one of the two services must remain
        for every endpoint.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        def clean_service(service):
            """Return None for a missing or blank service, otherwise the service unchanged."""
            if service is None:
                return None
            # Blank means whitespace-only, or nothing left after trimming spaces and quotes.
            if not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for endpoint_name in ("paddle_endpoints",):
            raw_grpc, raw_http = values.get(endpoint_name, (None, None))
            cleaned_pair = (clean_service(raw_grpc), clean_service(raw_http))

            if not any(cleaned_pair):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = cleaned_pair

        return values

    model_config = ConfigDict(extra="forbid")
95
+
96
+
97
class InfographicExtractorSchema(BaseModel):
    """
    Configuration schema for infographic extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue. Must be greater than 0.

    n_workers : int, default=2
        The number of workers to use for processing. Must be greater than 0.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during infographic extraction.

    endpoint_config : Optional[InfographicExtractorConfigSchema], default=None
        Configuration for the infographic extraction stage, including the paddle service endpoints.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[InfographicExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """
        Ensures that a supplied queue size or worker count is strictly positive.

        Parameters
        ----------
        v : int
            The value being validated.
        field : pydantic.ValidationInfo
            Validation context; its ``field_name`` is used in the error message.

        Returns
        -------
        int
            The validated value, unchanged.

        Raises
        ------
        ValueError
            If the value is less than or equal to 0.
        """
        if v <= 0:
            # The message now states the bound that is actually enforced (0, not 10).
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v

    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,218 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ from pydantic import model_validator, ConfigDict, BaseModel
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class PDFiumConfigSchema(BaseModel):
    """
    Configuration schema for PDFium endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. When empty it is derived from the
        configured services, preferring HTTP. The stored value is lower-cased.

    nim_batch_size : int, default=4
        Batch size setting (presumably for NIM inference requests; confirm against the consumer).

    workers_per_progress_engine : int, default=5
        Worker count setting (exact semantics are defined by the consumer of this schema).

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    nim_batch_size: int = 4
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalizes and validates the gRPC and HTTP services for all endpoints.

        Blank services are replaced with None via ``_clean_service``, the cleaned
        pair is written back, and the infer protocol is filled in when absent.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        for prefix in ("yolox",):
            endpoint_name = f"{prefix}_endpoints"
            protocol_key = f"{prefix}_infer_protocol"

            raw_grpc, raw_http = values.get(endpoint_name, ("", ""))
            grpc_service = _clean_service(raw_grpc)
            http_service = _clean_service(raw_http)

            if not (grpc_service or http_service):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            protocol = values.get(protocol_key)
            if not protocol:
                # No explicit protocol: choose by available service, HTTP first.
                if http_service:
                    protocol = "http"
                elif grpc_service:
                    protocol = "grpc"
                else:
                    protocol = ""
            values[protocol_key] = protocol.lower()

        return values

    model_config = ConfigDict(extra="forbid")
95
+
96
+
97
class NemoRetrieverParseConfigSchema(BaseModel):
    """
    Configuration schema for NemoRetrieverParse endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        The validator in this class does not check or normalize this field.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. Not derived by the validator in this class.

    nemoretriever_parse_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the nemoretriever_parse endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    nemoretriever_parse_infer_protocol : str, default=""
        Inference protocol for the nemoretriever_parse endpoint. When empty it is derived
        from the configured services, preferring HTTP. The stored value is lower-cased.

    model_name : str, default="nvidia/nemoretriever-parse"
        Name of the model to use.

    timeout : float, default=300.0
        Timeout value (presumably in seconds; confirm against the consumer).

    workers_per_progress_engine : int, default=5
        Worker count setting (exact semantics are defined by the consumer of this schema).

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    nemoretriever_parse_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    nemoretriever_parse_infer_protocol: str = ""

    model_name: str = "nvidia/nemoretriever-parse"

    timeout: float = 300.0

    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalizes and validates the gRPC and HTTP services for the nemoretriever_parse endpoint.

        Blank services are replaced with None via ``_clean_service``, the cleaned
        pair is written back, and the infer protocol is filled in when absent.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        for prefix in ("nemoretriever_parse",):
            endpoint_name = f"{prefix}_endpoints"
            protocol_key = f"{prefix}_infer_protocol"

            raw_grpc, raw_http = values.get(endpoint_name, ("", ""))
            grpc_service = _clean_service(raw_grpc)
            http_service = _clean_service(raw_http)

            if not (grpc_service or http_service):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            protocol = values.get(protocol_key)
            if not protocol:
                # No explicit protocol: choose by available service, HTTP first.
                if http_service:
                    protocol = "http"
                elif grpc_service:
                    protocol = "grpc"
                else:
                    protocol = ""
            values[protocol_key] = protocol.lower()

        return values

    model_config = ConfigDict(extra="forbid")
183
+
184
+
185
class PDFExtractorSchema(BaseModel):
    """
    Configuration schema for the PDF extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    pdfium_config : Optional[PDFiumConfigSchema], default=None
        Configuration for the PDFium service endpoints.

    nemoretriever_parse_config : Optional[NemoRetrieverParseConfigSchema], default=None
        Configuration for the NemoRetrieverParse service endpoints.

    Config
    ------
    extra : str
        Set to "forbid", so fields not declared on this schema are rejected.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    pdfium_config: Optional[PDFiumConfigSchema] = None
    nemoretriever_parse_config: Optional[NemoRetrieverParseConfigSchema] = None

    model_config = ConfigDict(extra="forbid")
212
+
213
+
214
+ def _clean_service(service):
215
+ """Set service to None if it's an empty string or contains only spaces or quotes."""
216
+ if service is None or not service.strip() or service.strip(" \"'") == "":
217
+ return None
218
+ return service
@@ -0,0 +1,124 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ from pydantic import model_validator, ConfigDict, BaseModel
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class PPTXConfigSchema(BaseModel):
    """
    Configuration schema for PPTX extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. When empty it is derived from the
        configured services ("http" if an HTTP service is set, otherwise "grpc").
        The stored value is always lower-cased.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalizes and validates the gRPC and HTTP services for all endpoints.

        Blank services (empty, whitespace-only, or made up only of spaces and quote
        characters) are replaced with None, and the infer protocol is filled in from
        the available services when it was not supplied.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.
            It is updated in place.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty (or missing) for any endpoint.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # Fall back to an empty pair when the key is missing (or explicitly None) so that the
            # "cannot be empty" ValueError below is raised rather than a TypeError from unpacking None.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # No explicit protocol: prefer HTTP when an HTTP service is configured.
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            protocol_value = protocol_value.lower()
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
98
+
99
+
100
class PPTXExtractorSchema(BaseModel):
    """
    Configuration schema for the PPTX extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    pptx_extraction_config : Optional[PPTXConfigSchema], default=None
        Configuration schema for the PPTX extraction stage.

    Config
    ------
    extra : str
        Set to "forbid", so fields not declared on this schema are rejected.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    pptx_extraction_config: Optional[PPTXConfigSchema] = None
    model_config = ConfigDict(extra="forbid")