nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from typing import Tuple
|
|
8
|
+
|
|
9
|
+
from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
|
|
10
|
+
|
|
11
|
+
from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class OCRExtractorConfigSchema(LowercaseProtocolMixin):
    """
    Endpoint and runtime options for the OCR extraction service.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token for secured services. Excluded from ``repr`` output.

    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        ``(gRPC, HTTP)`` service addresses for the OCR endpoint. Either entry
        may be blank, but not both.

    ocr_infer_protocol : str, default=""
        Inference protocol for the OCR endpoint. When empty, it is derived from
        ``ocr_endpoints``: ``"http"`` if an HTTP service is set, else ``"grpc"``.

    nim_batch_size : int, default=2
        Batch size option; consumed outside this module.

    workers_per_progress_engine : int, default=5
        Worker count option; consumed outside this module.

    Raises
    ------
    ValueError
        If both the gRPC and HTTP entries of ``ocr_endpoints`` are blank.

    Notes
    -----
    Unknown fields are rejected (``extra="forbid"``).
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    ocr_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalise the endpoint pair and fill in a missing inference protocol.

        Parameters
        ----------
        values : dict
            Raw input values for the model, before field validation.

        Returns
        -------
        dict
            The same dictionary, with the endpoint tuple cleaned and the
            protocol entry populated.

        Raises
        ------
        ValueError
            If both the gRPC and HTTP services are blank for an endpoint.
        """

        def _blank_to_none(raw):
            """Return None for None, whitespace-only, or space/quote-only input; else the input."""
            if raw is None:
                return None
            if not raw.strip():
                return None
            if raw.strip(" \"'") == "":
                return None
            return raw

        for endpoint_name in ("ocr_endpoints",):
            raw_grpc, raw_http = values.get(endpoint_name, (None, None))
            grpc_url = _blank_to_none(raw_grpc)
            http_url = _blank_to_none(raw_http)

            if not (grpc_url or http_url):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_url, http_url)

            # An explicitly supplied protocol wins; otherwise HTTP is preferred over gRPC.
            protocol_key = endpoint_name.replace("_endpoints", "_infer_protocol")
            inferred = "http" if http_url else ("grpc" if grpc_url else "")
            values[protocol_key] = values.get(protocol_key) or inferred

        return values

    model_config = ConfigDict(extra="forbid")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class OCRExtractorSchema(BaseModel):
    """
    Configuration schema for text extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.
        Must be greater than 0.

    n_workers : int, default=2
        The number of workers to use for processing. Must be greater than 0.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during text extraction.

    endpoint_config : Optional[OCRExtractorConfigSchema], default=None
        Configuration for the OCR service endpoints used by the extraction stage.

    Raises
    ------
    ValueError
        If ``max_queue_size`` or ``n_workers`` is not greater than 0.

    Notes
    -----
    Unknown fields are rejected (``extra="forbid"``).
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[OCRExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """
        Reject non-positive values for the queue size and worker count.

        Parameters
        ----------
        v : int
            The already type-validated field value.
        field : pydantic.ValidationInfo
            Validation context supplied by pydantic; only ``field_name`` is read.

        Returns
        -------
        int
            The value unchanged when it is greater than 0.

        Raises
        ------
        ValueError
            If the value is less than or equal to 0.
        """
        if v <= 0:
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v

    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
from pydantic import model_validator, ConfigDict, BaseModel, Field
|
|
11
|
+
|
|
12
|
+
from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PDFiumConfigSchema(LowercaseProtocolMixin):
    """
    Endpoint and runtime options for PDFium-based extraction.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token for secured services. Excluded from ``repr`` output.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        ``(gRPC, HTTP)`` service addresses for the yolox endpoint. Either entry
        may be blank, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. When empty, it is derived from
        ``yolox_endpoints``: ``"http"`` if an HTTP service is set, else ``"grpc"``.

    nim_batch_size : int, default=4
        Batch size option; consumed outside this module.

    workers_per_progress_engine : int, default=5
        Worker count option; consumed outside this module.

    Raises
    ------
    ValueError
        If both the gRPC and HTTP entries of ``yolox_endpoints`` are blank.

    Notes
    -----
    Unknown fields are rejected (``extra="forbid"``).
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    nim_batch_size: int = 4
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalise the yolox endpoint pair and fill in a missing inference protocol.

        Parameters
        ----------
        values : dict
            Raw input values for the model, before field validation.

        Returns
        -------
        dict
            The same dictionary, with the endpoint tuple cleaned and the
            protocol entry populated.

        Raises
        ------
        ValueError
            If both the gRPC and HTTP services are blank for an endpoint.
        """
        for model_name in ("yolox",):
            endpoint_name = f"{model_name}_endpoints"
            raw_grpc, raw_http = values.get(endpoint_name, ("", ""))
            # Blank entries are normalised by the module-level helper.
            grpc_url = _clean_service(raw_grpc)
            http_url = _clean_service(raw_http)

            if not (grpc_url or http_url):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_url, http_url)

            # An explicitly supplied protocol wins; otherwise HTTP is preferred over gRPC.
            protocol_key = f"{model_name}_infer_protocol"
            inferred = "http" if http_url else ("grpc" if grpc_url else "")
            values[protocol_key] = values.get(protocol_key) or inferred

        return values

    model_config = ConfigDict(extra="forbid")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class NemotronParseConfigSchema(LowercaseProtocolMixin):
    """
    Endpoint and runtime options for Nemotron Parse extraction.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token for secured services. Excluded from ``repr`` output.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        ``(gRPC, HTTP)`` service addresses for the yolox endpoint. This pair is
        not checked by ``validate_endpoints``.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. Not inferred by
        ``validate_endpoints``.

    nemotron_parse_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        ``(gRPC, HTTP)`` service addresses for the nemotron_parse endpoint.
        Either entry may be blank, but not both.

    nemotron_parse_infer_protocol : str, default=""
        Inference protocol for the nemotron_parse endpoint. When empty, it is
        derived from ``nemotron_parse_endpoints``: ``"http"`` if an HTTP service
        is set, else ``"grpc"``.

    nemotron_parse_model_name : str, default="nvidia/nemotron-parse"
        Model name option; consumed outside this module.

    timeout : float, default=300.0
        Timeout option; consumed outside this module (units not stated here).

    workers_per_progress_engine : int, default=5
        Worker count option; consumed outside this module.

    Raises
    ------
    ValueError
        If both the gRPC and HTTP entries of ``nemotron_parse_endpoints`` are blank.

    Notes
    -----
    Unknown fields are rejected (``extra="forbid"``).
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    nemotron_parse_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    nemotron_parse_infer_protocol: str = ""

    nemotron_parse_model_name: str = "nvidia/nemotron-parse"

    timeout: float = 300.0

    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalise the nemotron_parse endpoint pair and fill in a missing protocol.

        Parameters
        ----------
        values : dict
            Raw input values for the model, before field validation.

        Returns
        -------
        dict
            The same dictionary, with the endpoint tuple cleaned and the
            protocol entry populated.

        Raises
        ------
        ValueError
            If both the gRPC and HTTP services are blank for an endpoint.
        """
        for model_name in ("nemotron_parse",):
            endpoint_name = f"{model_name}_endpoints"
            raw_grpc, raw_http = values.get(endpoint_name, ("", ""))
            # Blank entries are normalised by the module-level helper.
            grpc_url = _clean_service(raw_grpc)
            http_url = _clean_service(raw_http)

            if not (grpc_url or http_url):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_url, http_url)

            # An explicitly supplied protocol wins; otherwise HTTP is preferred over gRPC.
            protocol_key = f"{model_name}_infer_protocol"
            inferred = "http" if http_url else ("grpc" if grpc_url else "")
            values[protocol_key] = values.get(protocol_key) or inferred

        return values

    model_config = ConfigDict(extra="forbid")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class PDFExtractorSchema(BaseModel):
    """
    Top-level settings for the PDF extractor.

    Parameters
    ----------
    max_queue_size : int, default=1
        Upper bound on the number of items in the processing queue.

    n_workers : int, default=16
        Number of workers used for processing.

    raise_on_failure : bool, default=False
        Whether a processing failure should raise an exception.

    pdfium_config : Optional[PDFiumConfigSchema], default=None
        Endpoint configuration for the PDFium extraction path.

    nemotron_parse_config : Optional[NemotronParseConfigSchema], default=None
        Endpoint configuration for the Nemotron Parse extraction path.

    Notes
    -----
    Unknown fields are rejected (``extra="forbid"``).
    """

    # Reject unknown keys so configuration typos fail validation.
    model_config = ConfigDict(extra="forbid")

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    pdfium_config: Optional[PDFiumConfigSchema] = None
    nemotron_parse_config: Optional[NemotronParseConfigSchema] = None
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _clean_service(service):
|
|
217
|
+
"""Set service to None if it's an empty string or contains only spaces or quotes."""
|
|
218
|
+
if service is None or not service.strip() or service.strip(" \"'") == "":
|
|
219
|
+
return None
|
|
220
|
+
return service
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
from pydantic import model_validator, ConfigDict, BaseModel, Field
|
|
11
|
+
|
|
12
|
+
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema
|
|
13
|
+
from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class PPTXConfigSchema(LowercaseProtocolMixin):
    """
    Configuration schema for pptx extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services. Excluded from ``repr`` output.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. When empty, it is derived from
        ``yolox_endpoints``: ``"http"`` if an HTTP service is set, else ``"grpc"``.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for all endpoints.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for any endpoint,
            including when the endpoint entry is missing or ``None``.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # A missing or None entry falls back to an empty pair, so the
            # ValueError below is raised instead of a TypeError on unpacking.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            # Auto-infer protocol from endpoints if not specified
            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class PPTXExtractorSchema(BaseModel):
    """
    Top-level settings for the PPTX extractor.

    Parameters
    ----------
    max_queue_size : int, default=1
        Upper bound on the number of items in the processing queue.

    n_workers : int, default=16
        Number of workers used for processing.

    raise_on_failure : bool, default=False
        Whether a processing failure should raise an exception.

    pptx_extraction_config : Optional[PPTXConfigSchema], default=None
        Endpoint configuration for the PPTX extraction stage.

    pdfium_config : Optional[PDFiumConfigSchema], default=None
        PDFium endpoint configuration; how it is used is decided outside this module.

    Notes
    -----
    Unknown fields are rejected (``extra="forbid"``).
    """

    # Reject unknown keys so configuration typos fail validation.
    model_config = ConfigDict(extra="forbid")

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    pptx_extraction_config: Optional[PPTXConfigSchema] = None
    pdfium_config: Optional[PDFiumConfigSchema] = None
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
|
|
11
|
+
|
|
12
|
+
from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TableExtractorConfigSchema(LowercaseProtocolMixin):
    """
    Endpoint and runtime options for the table extraction stage.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token for secured services. Excluded from ``repr`` output.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        ``(gRPC, HTTP)`` service addresses for the yolox endpoint. Either entry
        may be blank, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol for the yolox endpoint. When empty, it is derived from
        ``yolox_endpoints``: ``"http"`` if an HTTP service is set, else ``"grpc"``.

    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        ``(gRPC, HTTP)`` service addresses for the ocr endpoint. Either entry
        may be blank, but not both.

    ocr_infer_protocol : str, default=""
        Inference protocol for the ocr endpoint, derived from ``ocr_endpoints``
        in the same way when empty.

    nim_batch_size : int, default=2
        Batch size option; consumed outside this module.

    workers_per_progress_engine : int, default=5
        Worker count option; consumed outside this module.

    Raises
    ------
    ValueError
        If both the gRPC and HTTP entries are blank for ``yolox_endpoints`` or
        for ``ocr_endpoints``.

    Notes
    -----
    Unknown fields are rejected (``extra="forbid"``).
    """

    auth_token: Optional[str] = Field(default=None, repr=False)

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    ocr_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Normalise the yolox and ocr endpoint pairs and fill in missing protocols.

        Parameters
        ----------
        values : dict
            Raw input values for the model, before field validation.

        Returns
        -------
        dict
            The same dictionary, with each endpoint tuple cleaned and each
            protocol entry populated.

        Raises
        ------
        ValueError
            If both the gRPC and HTTP services are blank for either endpoint.
        """

        def _blank_to_none(raw):
            """Return None for None, whitespace-only, or space/quote-only input; else the input."""
            if raw is None:
                return None
            if not raw.strip():
                return None
            if raw.strip(" \"'") == "":
                return None
            return raw

        # Endpoints are processed in this order; the first invalid one raises.
        for endpoint_name in ("yolox_endpoints", "ocr_endpoints"):
            raw_grpc, raw_http = values.get(endpoint_name, (None, None))
            grpc_url = _blank_to_none(raw_grpc)
            http_url = _blank_to_none(raw_http)

            if not (grpc_url or http_url):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_url, http_url)

            # An explicitly supplied protocol wins; otherwise HTTP is preferred over gRPC.
            protocol_key = endpoint_name.replace("_endpoints", "_infer_protocol")
            inferred = "http" if http_url else ("grpc" if grpc_url else "")
            values[protocol_key] = values.get(protocol_key) or inferred

        return values

    model_config = ConfigDict(extra="forbid")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class TableExtractorSchema(BaseModel):
    """
    Configuration schema for the table extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue. Must be greater than 0.

    n_workers : int, default=2
        The number of worker threads to use for processing. Must be greater than 0.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during table extraction.

    endpoint_config : Optional[TableExtractorConfigSchema], default=None
        Configuration for the table extraction stage, including yolox and ocr service endpoints.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False
    endpoint_config: Optional[TableExtractorConfigSchema] = None

    model_config = ConfigDict(extra="forbid")

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, info):
        """
        Ensure the validated value is strictly positive.

        Parameters
        ----------
        v : int
            The value of the field being validated.
        info : pydantic.ValidationInfo
            Validation context; its ``field_name`` is used in the error message.

        Returns
        -------
        int
            The unchanged value when it is greater than 0.

        Raises
        ------
        ValueError
            If the value is less than or equal to 0.
        """
        if v <= 0:
            raise ValueError(f"{info.field_name} must be greater than 0.")
        return v
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
from typing import Optional, Literal, Annotated
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MessageBrokerClientSchema(BaseModel):
    """
    Connection settings for a message broker client.

    The ``client_type`` field selects between the two supported client kinds,
    ``"redis"`` and ``"simple"``. Numeric fields carry range constraints that
    pydantic enforces at validation time.
    """

    # Where the broker lives.
    host: str = Field(default="redis", description="Hostname of the broker service.")
    port: int = Field(
        default=6379,
        gt=0,
        lt=65536,
        description="Port to connect to. Must be between 1 and 65535.",
    )

    # Which client implementation to use, plus free-form extras handed to it.
    client_type: Literal["redis", "simple"] = Field(
        default="redis",
        description="Type of broker client. Supported values: 'redis', 'simple'.",
    )
    broker_params: Optional[dict] = Field(
        default_factory=dict,
        description="Optional parameters passed to the broker client.",
    )

    # Timing and retry limits; all must be non-negative.
    connection_timeout: int = Field(
        default=300,
        ge=0,
        description="Connection timeout in seconds. Must be >= 0.",
    )
    max_backoff: int = Field(
        default=300,
        ge=0,
        description="Maximum backoff time in seconds. Must be >= 0.",
    )
    max_retries: int = Field(
        default=0,
        ge=0,
        description="Maximum number of retries. Must be >= 0.",
    )
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import ConfigDict, BaseModel
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Define schemas for request validation
|
|
16
|
+
class PushRequestSchema(BaseModel):
    """
    Request payload for pushing a message onto a queue.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Command name carried by the request.
    command: str
    # Target queue; must be a non-empty string.
    queue_name: str = Field(..., min_length=1)
    # Message body; must be a non-empty string.
    message: str = Field(..., min_length=1)
    # Optional timeout for a blocking push.
    timeout: Optional[float] = 100
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PopRequestSchema(BaseModel):
    """
    Request payload for popping a message from a queue.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Command name carried by the request.
    command: str
    # Source queue; must be a non-empty string.
    queue_name: str = Field(..., min_length=1)
    # Optional timeout for a blocking pop.
    timeout: Optional[float] = 100
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class SizeRequestSchema(BaseModel):
    """
    Request payload for querying the size of a queue.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Command name carried by the request.
    command: str
    # Queue to inspect; must be a non-empty string.
    queue_name: str = Field(..., min_length=1)
|