nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
from pydantic import model_validator, ConfigDict, BaseModel
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ImageConfigSchema(BaseModel):
    """
    Configuration schema for image extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. When empty, it is derived
        from the configured services: "http" if an HTTP service is set,
        otherwise "grpc". The stored value is lower-cased.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for all endpoints.

        Blank services (empty, whitespace-only, or quotes-only strings) are
        normalized to None, and the inference protocol is filled in when it
        was not supplied.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint,
            including when the endpoint entry is missing or None.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # Fall back to an empty pair so that a missing (or explicitly None)
            # entry is reported through the ValueError below rather than a
            # TypeError raised while unpacking None.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # At least one service is set at this point, so "grpc" is the
                # only alternative when no HTTP service was configured.
                protocol_value = "http" if http_service else "grpc"
            protocol_value = protocol_value.lower()
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class ImageExtractorSchema(BaseModel):
    """
    Configuration schema for the image extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    image_extraction_config : Optional[ImageConfigSchema], default=None
        Configuration schema for the image extraction stage.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    image_extraction_config: Optional[ImageConfigSchema] = None
    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from typing import Tuple
|
|
8
|
+
|
|
9
|
+
from pydantic import field_validator, model_validator, ConfigDict, BaseModel
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class InfographicExtractorConfigSchema(BaseModel):
    """
    Endpoint and option settings for the infographic extraction service.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        The (gRPC, HTTP) service pair for the paddle endpoint. One of the two
        may be empty, but not both.

    paddle_infer_protocol : str, default=""
        Inference protocol for the paddle endpoint. This schema stores the
        value as given; it is not derived from the endpoints.

    nim_batch_size : int, default=2
        Batch size setting stored on the schema.

    workers_per_progress_engine : int, default=5
        Worker count setting stored on the schema.

    Methods
    -------
    validate_endpoints(values)
        Checks that every endpoint has at least one of its gRPC/HTTP services set.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    paddle_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalize and check the gRPC/HTTP service pair of every endpoint.

        Blank entries (empty, whitespace-only, or quotes-only strings) are
        replaced by None before the pair is written back into ``values``.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.

        Returns
        -------
        dict
            The same dictionary, with each endpoint pair normalized.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        def _normalize(raw):
            """Return None for a missing or blank service string, else the string unchanged."""
            if raw is None:
                return None
            if raw.strip() == "" or not raw.strip(" \"'"):
                return None
            return raw

        for endpoint_name in ("paddle_endpoints",):
            raw_grpc, raw_http = values.get(endpoint_name, (None, None))
            pair = (_normalize(raw_grpc), _normalize(raw_http))

            # Reject the configuration when neither transport is usable.
            if not any(pair):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = pair

        return values

    model_config = ConfigDict(extra="forbid")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class InfographicExtractorSchema(BaseModel):
    """
    Configuration schema for infographic extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.
        Must be greater than 0.

    n_workers : int, default=2
        The number of workers to use for processing. Must be greater than 0.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during infographic extraction.

    endpoint_config : Optional[InfographicExtractorConfigSchema], default=None
        Endpoint configuration for the infographic extraction stage
        (see ``InfographicExtractorConfigSchema``).

    Raises
    ------
    ValueError
        If ``max_queue_size`` or ``n_workers`` is not greater than 0.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[InfographicExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """
        Ensure the validated field holds a strictly positive value.

        Parameters
        ----------
        v : int
            The value being validated.
        field : object
            Validation context supplied by pydantic; only its ``field_name``
            attribute is read, to name the offending field in the error.

        Returns
        -------
        int
            The unchanged value when it is greater than 0.

        Raises
        ------
        ValueError
            If ``v`` is less than or equal to 0.
        """
        if v <= 0:
            # The message states the bound that is actually enforced above.
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v

    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
from pydantic import model_validator, ConfigDict, BaseModel
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class PDFiumConfigSchema(BaseModel):
    """
    Endpoint and option settings for PDFium-based extraction.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        The (gRPC, HTTP) service pair for the yolox endpoint. One of the two
        may be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. When empty, it is derived
        from the configured services and stored lower-cased.

    nim_batch_size : int, default=4
        Batch size setting stored on the schema.

    workers_per_progress_engine : int, default=5
        Worker count setting stored on the schema.

    Methods
    -------
    validate_endpoints(values)
        Checks that every endpoint has at least one of its gRPC/HTTP services set.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    nim_batch_size: int = 4
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalize each endpoint's service pair and settle its protocol.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.

        Returns
        -------
        dict
            The same dictionary, with endpoint pairs normalized and the
            inference protocol filled in and lower-cased.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        for prefix in ("yolox",):
            endpoint_name = f"{prefix}_endpoints"
            protocol_key = f"{prefix}_infer_protocol"

            raw_grpc, raw_http = values.get(endpoint_name, ("", ""))
            grpc_url = _clean_service(raw_grpc)
            http_url = _clean_service(raw_http)

            if not (grpc_url or http_url):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_url, http_url)

            chosen = values.get(protocol_key)
            if not chosen:
                # One service is guaranteed to be set here; HTTP takes
                # precedence when both are available.
                if http_url:
                    chosen = "http"
                else:
                    chosen = "grpc"
            values[protocol_key] = chosen.lower()

        return values

    model_config = ConfigDict(extra="forbid")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class NemoRetrieverParseConfigSchema(BaseModel):
    """
    Endpoint and option settings for NemoRetrieverParse-based extraction.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        The (gRPC, HTTP) service pair for the yolox endpoint. This pair is
        not examined by ``validate_endpoints``.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint, stored as given.

    nemoretriever_parse_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        The (gRPC, HTTP) service pair for the nemoretriever_parse endpoint.
        One of the two may be empty, but not both.

    nemoretriever_parse_infer_protocol : str, default=""
        Inference protocol for the nemoretriever_parse endpoint. When empty,
        it is derived from the configured services and stored lower-cased.

    model_name : str, default="nvidia/nemoretriever-parse"
        Model name setting stored on the schema.

    timeout : float, default=300.0
        Timeout setting stored on the schema.

    workers_per_progress_engine : int, default=5
        Worker count setting stored on the schema.

    Methods
    -------
    validate_endpoints(values)
        Checks that the nemoretriever_parse endpoint has at least one of its
        gRPC/HTTP services set.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for the validated endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    nemoretriever_parse_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    nemoretriever_parse_infer_protocol: str = ""

    model_name: str = "nvidia/nemoretriever-parse"

    timeout: float = 300.0

    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalize the nemoretriever_parse service pair and settle its protocol.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.

        Returns
        -------
        dict
            The same dictionary, with the endpoint pair normalized and the
            inference protocol filled in and lower-cased.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint.
        """

        for prefix in ("nemoretriever_parse",):
            endpoint_name = f"{prefix}_endpoints"
            protocol_key = f"{prefix}_infer_protocol"

            raw_grpc, raw_http = values.get(endpoint_name, ("", ""))
            grpc_url = _clean_service(raw_grpc)
            http_url = _clean_service(raw_http)

            if not (grpc_url or http_url):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_url, http_url)

            chosen = values.get(protocol_key)
            if not chosen:
                # One service is guaranteed to be set here; HTTP takes
                # precedence when both are available.
                if http_url:
                    chosen = "http"
                else:
                    chosen = "grpc"
            values[protocol_key] = chosen.lower()

        return values

    model_config = ConfigDict(extra="forbid")
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class PDFExtractorSchema(BaseModel):
    """
    Configuration schema for the PDF extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    pdfium_config : Optional[PDFiumConfigSchema], default=None
        Configuration for the PDFium service endpoints.

    nemoretriever_parse_config : Optional[NemoRetrieverParseConfigSchema], default=None
        Configuration for the NemoRetrieverParse service endpoints.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    pdfium_config: Optional[PDFiumConfigSchema] = None
    nemoretriever_parse_config: Optional[NemoRetrieverParseConfigSchema] = None

    model_config = ConfigDict(extra="forbid")
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _clean_service(service):
|
|
215
|
+
"""Set service to None if it's an empty string or contains only spaces or quotes."""
|
|
216
|
+
if service is None or not service.strip() or service.strip(" \"'") == "":
|
|
217
|
+
return None
|
|
218
|
+
return service
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
from pydantic import model_validator, ConfigDict, BaseModel
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class PPTXConfigSchema(BaseModel):
    """
    Configuration schema for pptx extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. When empty, it is derived
        from the configured services: "http" if an HTTP service is set,
        otherwise "grpc". The stored value is lower-cased.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for all endpoints.

        Blank services (empty, whitespace-only, or quotes-only strings) are
        normalized to None, and the inference protocol is filled in when it
        was not supplied.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint,
            including when the endpoint entry is missing or None.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # Fall back to an empty pair so that a missing (or explicitly None)
            # entry is reported through the ValueError below rather than a
            # TypeError raised while unpacking None.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # At least one service is set at this point, so "grpc" is the
                # only alternative when no HTTP service was configured.
                protocol_value = "http" if http_service else "grpc"
            protocol_value = protocol_value.lower()
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class PPTXExtractorSchema(BaseModel):
    """
    Configuration schema for the PPTX extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of workers to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    pptx_extraction_config : Optional[PPTXConfigSchema], default=None
        Configuration schema for the pptx extraction stage.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    pptx_extraction_config: Optional[PPTXConfigSchema] = None
    model_config = ConfigDict(extra="forbid")
|