nv-ingest-api 2025.4.15.dev20250415__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +72 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.15.dev20250415.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
from pydantic import field_validator, model_validator, ConfigDict, BaseModel
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TableExtractorConfigSchema(BaseModel):
    """
    Configuration schema for the table extraction stage settings.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol to use for the yolox endpoint.

    paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the paddle endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    paddle_infer_protocol : str, default=""
        Inference protocol to use for the paddle endpoint.

    nim_batch_size : int, default=2
        Batch size setting for NIM requests.

    workers_per_progress_engine : int, default=5
        Number of workers per progress engine.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for
        each of the yolox and paddle endpoints.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for the yolox or the paddle endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    model_config = ConfigDict(extra="forbid")

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    paddle_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for the yolox and paddle endpoints.

        Blank entries (empty, whitespace-only, or consisting only of spaces and
        quote characters) are normalised to ``None`` before the check.

        Parameters
        ----------
        values : dict
            Dictionary containing the values of the attributes for the class.

        Returns
        -------
        dict
            A copy of the input with the cleaned endpoint tuples. The caller's
            dictionary is left untouched.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty for the yolox or the paddle endpoint.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None:
                return None
            # Only strings can be "blank"; any other type is passed through so the
            # regular field validation reports it instead of an AttributeError here.
            if isinstance(service, str) and (not service.strip() or service.strip(" \"'") == ""):
                return None
            return service

        # A "before" validator receives the raw input, which is not necessarily a dict.
        if not isinstance(values, dict):
            return values

        # Work on a shallow copy so the caller's input is not modified in place.
        cleaned = dict(values)

        for endpoint_name in ["yolox_endpoints", "paddle_endpoints"]:
            # An explicit None is treated like a missing pair so it yields the
            # descriptive error below rather than an unpacking failure.
            grpc_service, http_service = cleaned.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            cleaned[endpoint_name] = (grpc_service, http_service)

        return cleaned
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class TableExtractorSchema(BaseModel):
    """
    Configuration schema for the table extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=2
        The number of worker threads to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during table extraction.

    endpoint_config : Optional[TableExtractorConfigSchema], default=None
        Configuration for the table extraction stage, including yolox and paddle service endpoints.

    Raises
    ------
    ValueError
        If ``max_queue_size`` or ``n_workers`` is not strictly positive.
    """

    model_config = ConfigDict(extra="forbid")

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[TableExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """Reject zero or negative values for the queue size and worker count."""
        if v <= 0:
            # The message states the bound that is actually enforced (> 0).
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from typing import Optional, Literal
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, BaseModel
|
|
9
|
+
from typing_extensions import Annotated
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MessageBrokerClientSchema(BaseModel):
    """
    Connection settings for a message broker client.

    Parameters
    ----------
    host : str, default="redis"
        Broker host name.
    port : int, default=6379
        Broker port; must be in the range 1-65535.
    client_type : {"redis", "simple"}, default="redis"
        Which broker client implementation to use.
    broker_params : Optional[dict], default={}
        Extra broker parameters; a fresh dict is created per instance.
    connection_timeout : Optional[int], default=300
        Non-negative connection timeout.
    max_backoff : Optional[int], default=300
        Non-negative maximum backoff.
    max_retries : Optional[int], default=0
        Non-negative retry limit.
    """

    # Where the broker lives.
    host: str = "redis"
    port: Annotated[int, Field(gt=0, lt=65536)] = 6379

    # Extend this Literal when a new broker type is added.
    client_type: Literal["redis", "simple"] = "redis"  # Restrict to 'redis' or 'simple'

    broker_params: Optional[dict] = Field(default_factory=dict)

    # Retry / timeout tuning; each value is constrained to be >= 0.
    connection_timeout: Optional[Annotated[int, Field(ge=0)]] = 300
    max_backoff: Optional[Annotated[int, Field(ge=0)]] = 300
    max_retries: Optional[Annotated[int, Field(ge=0)]] = 0
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import ConfigDict, BaseModel
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Define schemas for request validation
|
|
16
|
+
class PushRequestSchema(BaseModel):
    """
    Request payload for pushing a message onto a queue.

    ``queue_name`` and ``message`` are required and must be non-empty;
    unknown fields are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    command: str
    queue_name: str = Field(..., min_length=1)
    message: str = Field(..., min_length=1)
    # Optional timeout for a blocking push.
    timeout: Optional[float] = 100
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PopRequestSchema(BaseModel):
    """
    Request payload for popping a message from a queue.

    ``queue_name`` is required and must be non-empty; unknown fields are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    command: str
    queue_name: str = Field(..., min_length=1)
    # Optional timeout for a blocking pop.
    timeout: Optional[float] = 100
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class SizeRequestSchema(BaseModel):
    """
    Request payload for a queue size query.

    ``queue_name`` is required and must be non-empty; unknown fields are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    command: str
    queue_name: str = Field(..., min_length=1)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# NOTE: This code is duplicated from the ingest service:
|
|
6
|
+
# src/nv_ingest_client/schemas/response_schema.py
|
|
7
|
+
# Eventually we should move all client wrappers for the message broker into a shared library that both the ingest
|
|
8
|
+
# service and the client can use.
|
|
9
|
+
|
|
10
|
+
from typing import Optional, Union
|
|
11
|
+
from pydantic import BaseModel
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ResponseSchema(BaseModel):
    """
    Response envelope with a status code, a reason, and an optional payload.

    Parameters
    ----------
    response_code : int
        Status code of the response (required).
    response_reason : Optional[str], default="OK"
        Human-readable reason accompanying the code.
    response : Union[str, dict, None], default=None
        Response payload, if any.
    trace_id : Optional[str], default=None
        Unique trace ID.
    transaction_id : Optional[str], default=None
        Unique transaction ID.
    """

    response_code: int
    response_reason: Optional[str] = "OK"
    response: Union[str, dict, None] = None

    # Correlation identifiers.
    trace_id: Optional[str] = None  # Unique trace ID
    transaction_id: Optional[str] = None  # Unique transaction ID
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from pydantic import ConfigDict, BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Define a base class with extra fields forbidden
|
|
10
|
+
class BaseModelNoExt(BaseModel):
    """Pydantic base model configured with ``extra="forbid"`` so unknown fields are rejected."""

    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any, Dict, List, Optional, Union, Annotated
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, field_validator, model_validator
|
|
9
|
+
|
|
10
|
+
from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
|
|
11
|
+
from nv_ingest_api.internal.enums.common import ContentTypeEnum, TaskTypeEnum, DocumentTypeEnum
|
|
12
|
+
|
|
13
|
+
# ------------------------------------------------------------------------------
|
|
14
|
+
# Logging Configuration
|
|
15
|
+
# ------------------------------------------------------------------------------
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ------------------------------------------------------------------------------
|
|
20
|
+
# Schemas: Common and Task-Specific
|
|
21
|
+
# ------------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Tracing Options Schema
|
|
25
|
+
class TracingOptionsSchema(BaseModelNoExt):
    """
    Tracing options attached to an ingest job.

    ``ts_send`` is the only required field.
    """

    trace: bool = False
    ts_send: int  # required; has no default
    trace_id: Optional[str] = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Ingest Task Schemas
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class IngestTaskSplitSchema(BaseModelNoExt):
    """
    Properties of a split task.

    Parameters
    ----------
    tokenizer : Optional[str], default=None
        Tokenizer identifier.
    chunk_size : int, default=1024
        Chunk size; must be greater than 0.
    chunk_overlap : int, default=150
        Overlap between chunks; must be >= 0 and smaller than ``chunk_size``.
    params : dict
        Additional task parameters (required).
    """

    tokenizer: Optional[str] = None
    chunk_size: Annotated[int, Field(gt=0)] = 1024
    chunk_overlap: Annotated[int, Field(ge=0)] = 150
    params: dict

    @field_validator("chunk_overlap")
    def check_chunk_overlap(cls, v, values, **kwargs):
        """Ensure the overlap is strictly smaller than an already-validated ``chunk_size``."""
        already_validated = values.data
        # chunk_size is absent from the validated data when its own validation failed;
        # in that case there is nothing to compare against.
        if v is not None and "chunk_size" in already_validated:
            if v >= already_validated["chunk_size"]:
                raise ValueError("chunk_overlap must be less than chunk_size")
        return v
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class IngestTaskExtractSchema(BaseModelNoExt):
    """
    Properties of an extract task.

    ``document_type`` is matched case-insensitively when given as a string.
    """

    document_type: DocumentTypeEnum
    method: str
    params: dict

    @field_validator("document_type", mode="before")
    @classmethod
    def case_insensitive_document_type(cls, v):
        """Lower-case string input, then convert it to a ``DocumentTypeEnum`` member."""
        v = v.lower() if isinstance(v, str) else v
        try:
            member = DocumentTypeEnum(v)
        except ValueError:
            raise ValueError(f"{v} is not a valid DocumentTypeEnum value")
        return member
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class IngestTaskStoreEmbedSchema(BaseModelNoExt):
    """Properties of a store-embedding task: a single required ``params`` dict."""

    params: dict
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class IngestTaskStoreSchema(BaseModelNoExt):
    """
    Properties of a store task.

    ``method`` and ``params`` are required; the two content flags have defaults.
    """

    # Content selection flags.
    structured: bool = True
    images: bool = False

    # Required storage settings.
    method: str
    params: dict
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# Captioning: All fields are optional and override default parameters.
|
|
75
|
+
class IngestTaskCaptionSchema(BaseModelNoExt):
    """
    Properties of a caption task.

    Every field is optional and defaults to ``None``.
    """

    api_key: Optional[str] = None
    endpoint_url: Optional[str] = None
    prompt: Optional[str] = None
    model_name: Optional[str] = None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class IngestTaskFilterParamsSchema(BaseModelNoExt):
    """
    Parameters of a filter task.

    Parameters
    ----------
    min_size : int, default=128
        Minimum size threshold.
    max_aspect_ratio : Union[float, int], default=5.0
        Upper aspect-ratio bound.
    min_aspect_ratio : Union[float, int], default=0.2
        Lower aspect-ratio bound.
    filter : bool, default=False
        Filter flag.
    """

    min_size: int = 128
    max_aspect_ratio: Union[float, int] = 5.0
    min_aspect_ratio: Union[float, int] = 0.2
    filter: bool = False
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class IngestTaskFilterSchema(BaseModelNoExt):
    """
    Properties of a filter task.

    Defaults to image content with the default ``IngestTaskFilterParamsSchema``.
    """

    content_type: ContentTypeEnum = ContentTypeEnum.IMAGE
    params: IngestTaskFilterParamsSchema = IngestTaskFilterParamsSchema()
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class IngestTaskDedupParams(BaseModelNoExt):
    """Parameters of a dedup task: a single ``filter`` flag, off by default."""

    filter: bool = False
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class IngestTaskDedupSchema(BaseModelNoExt):
    """
    Properties of a dedup task.

    Defaults to image content with the default ``IngestTaskDedupParams``.
    """

    content_type: ContentTypeEnum = ContentTypeEnum.IMAGE
    params: IngestTaskDedupParams = IngestTaskDedupParams()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class IngestTaskEmbedSchema(BaseModelNoExt):
    """
    Properties of an embed task.

    The three string settings are optional; ``filter_errors`` defaults to ``False``.
    """

    endpoint_url: Optional[str] = None
    model_name: Optional[str] = None
    api_key: Optional[str] = None
    filter_errors: bool = False
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class IngestTaskVdbUploadSchema(BaseModelNoExt):
    """
    Properties of a vector-database upload task.

    Note that ``filter_errors`` defaults to ``True`` here.
    """

    # Bulk ingestion settings.
    bulk_ingest: bool = False
    bulk_ingest_path: Optional[str] = None

    params: Optional[dict] = None
    filter_errors: bool = True
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class IngestTaskAudioExtraction(BaseModelNoExt):
    """
    Properties of an audio extraction task.

    Every field is optional and defaults to ``None``.
    """

    auth_token: Optional[str] = None

    # Service endpoints and protocol selection.
    grpc_endpoint: Optional[str] = None
    http_endpoint: Optional[str] = None
    infer_protocol: Optional[str] = None

    function_id: Optional[str] = None

    # SSL settings.
    use_ssl: Optional[bool] = None
    ssl_cert: Optional[str] = None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class IngestTaskTableExtraction(BaseModelNoExt):
    """Properties of a table extraction task; ``params`` defaults to a fresh empty dict."""

    params: dict = Field(default_factory=dict)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class IngestTaskChartExtraction(BaseModelNoExt):
    """Properties of a chart extraction task; ``params`` defaults to a fresh empty dict."""

    params: dict = Field(default_factory=dict)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class IngestTaskInfographicExtraction(BaseModelNoExt):
    """Properties of an infographic extraction task; ``params`` defaults to a fresh empty dict."""

    params: dict = Field(default_factory=dict)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class IngestTaskSchema(BaseModelNoExt):
    """
    A single ingest task: its type, the matching properties, and a failure policy.

    Parameters
    ----------
    type : TaskTypeEnum
        Task type; string input is matched case-insensitively.
    task_properties : one of the ``IngestTask*`` schemas
        Properties for the task. When both ``type`` and ``task_properties`` are
        supplied, the properties are validated against the schema registered
        for that type.
    raise_on_failure : bool, default=False
        Failure-policy flag for the task.
    """

    type: TaskTypeEnum
    task_properties: Union[
        IngestTaskSplitSchema,
        IngestTaskExtractSchema,
        IngestTaskStoreEmbedSchema,
        IngestTaskStoreSchema,
        IngestTaskEmbedSchema,
        IngestTaskCaptionSchema,
        IngestTaskDedupSchema,
        IngestTaskFilterSchema,
        IngestTaskVdbUploadSchema,
        IngestTaskAudioExtraction,
        IngestTaskTableExtraction,
        IngestTaskChartExtraction,
        IngestTaskInfographicExtraction,
    ]
    raise_on_failure: bool = False

    @model_validator(mode="before")
    @classmethod
    def check_task_properties_type(cls, values):
        """
        Validate ``task_properties`` against the schema that matches ``type``.

        This runs before field validation, so ``type`` is still the raw input
        here and is normalised locally before the schema lookup.

        Parameters
        ----------
        values : dict
            Raw input data for the model.

        Returns
        -------
        dict
            A copy of the input whose ``task_properties`` entry holds the
            validated schema instance. Input that is not a dict, or that lacks
            a type or properties, is returned unchanged.

        Raises
        ------
        ValueError
            If ``type`` is not a valid ``TaskTypeEnum`` value or has no
            registered properties schema.
        """
        # A "before" validator receives the raw input, which is not necessarily a dict.
        if not isinstance(values, dict):
            return values

        task_type, task_properties = values.get("type"), values.get("task_properties", {})
        if not (task_type and task_properties):
            return values

        # The case-insensitive field validator for `type` has not run yet, so
        # apply the same normalisation here; otherwise "SPLIT" would miss the lookup.
        if not isinstance(task_type, TaskTypeEnum):
            candidate = task_type.lower() if isinstance(task_type, str) else task_type
            try:
                task_type = TaskTypeEnum(candidate)
            except ValueError:
                raise ValueError(f"{candidate} is not a valid TaskTypeEnum value")

        schema_by_type = {
            TaskTypeEnum.CAPTION: IngestTaskCaptionSchema,
            TaskTypeEnum.DEDUP: IngestTaskDedupSchema,
            TaskTypeEnum.EMBED: IngestTaskEmbedSchema,
            TaskTypeEnum.EXTRACT: IngestTaskExtractSchema,
            TaskTypeEnum.FILTER: IngestTaskFilterSchema,  # Extend mapping as necessary
            TaskTypeEnum.SPLIT: IngestTaskSplitSchema,
            TaskTypeEnum.STORE_EMBEDDING: IngestTaskStoreEmbedSchema,
            TaskTypeEnum.STORE: IngestTaskStoreSchema,
            TaskTypeEnum.VDB_UPLOAD: IngestTaskVdbUploadSchema,
            TaskTypeEnum.AUDIO_DATA_EXTRACT: IngestTaskAudioExtraction,
            TaskTypeEnum.TABLE_DATA_EXTRACT: IngestTaskTableExtraction,
            TaskTypeEnum.CHART_DATA_EXTRACT: IngestTaskChartExtraction,
            TaskTypeEnum.INFOGRAPHIC_DATA_EXTRACT: IngestTaskInfographicExtraction,
        }
        expected_type = schema_by_type.get(task_type)
        if expected_type is None:
            # Report a validation error instead of calling None below.
            raise ValueError(f"No task_properties schema is registered for task type {task_type}")

        # Validate task_properties against the expected schema. An instance of
        # the expected schema is already validated and is kept as-is.
        if not isinstance(task_properties, expected_type):
            task_properties = expected_type(**task_properties)

        # Return a copy so the caller's input dictionary is not modified.
        return {**values, "task_properties": task_properties}

    @field_validator("type", mode="before")
    @classmethod
    def case_insensitive_task_type(cls, v):
        """Lower-case string input, then convert it to a ``TaskTypeEnum`` member."""
        if isinstance(v, str):
            v = v.lower()
        try:
            return TaskTypeEnum(v)
        except ValueError:
            raise ValueError(f"{v} is not a valid TaskTypeEnum value")
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# ------------------------------------------------------------------------------
|
|
200
|
+
# Schemas: Job Schemas
|
|
201
|
+
# ------------------------------------------------------------------------------
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class JobPayloadSchema(BaseModelNoExt):
    """
    Payload of an ingest job, expressed as four required lists.

    NOTE(review): the lists look like parallel per-document columns — confirm
    against the producer; no length check is enforced here.
    """

    content: List[Union[str, bytes]]
    source_name: List[str]
    source_id: List[Union[str, int]]
    document_type: List[str]
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class IngestJobSchema(BaseModelNoExt):
    """
    Top-level ingest job: payload, identifier, task list, and optional tracing options.
    """

    job_payload: JobPayloadSchema
    job_id: Union[str, int]
    tasks: List[IngestTaskSchema]
    # Tracing is optional; absent means None.
    tracing_options: Optional[TracingOptionsSchema] = None
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# ------------------------------------------------------------------------------
|
|
219
|
+
# Utility Functions
|
|
220
|
+
# ------------------------------------------------------------------------------
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def validate_ingest_job(job_data: Dict[str, Any]) -> IngestJobSchema:
    """
    Build an ``IngestJobSchema`` from a dictionary describing an ingest job.

    Parameters:
    - job_data: Dictionary representing an ingest job; its items are passed to
      the schema as keyword arguments.

    Returns:
    - IngestJobSchema: The validated ingest job.

    Raises:
    - ValidationError: If the input data does not conform to the IngestJobSchema.
    """
    validated_job = IngestJobSchema(**job_data)
    return validated_job
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
from typing import Dict
|
|
10
|
+
from typing import List
|
|
11
|
+
from typing import Optional
|
|
12
|
+
from typing import Union
|
|
13
|
+
|
|
14
|
+
from pydantic import field_validator, model_validator, Field
|
|
15
|
+
|
|
16
|
+
from nv_ingest_api.internal.enums.common import (
|
|
17
|
+
AccessLevelEnum,
|
|
18
|
+
ContentTypeEnum,
|
|
19
|
+
TextTypeEnum,
|
|
20
|
+
LanguageEnum,
|
|
21
|
+
TableFormatEnum,
|
|
22
|
+
StatusEnum,
|
|
23
|
+
DocumentTypeEnum,
|
|
24
|
+
TaskTypeEnum,
|
|
25
|
+
)
|
|
26
|
+
from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
|
|
27
|
+
from nv_ingest_api.util.converters import datetools
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Sub schemas
|
|
33
|
+
class SourceMetadataSchema(BaseModelNoExt):
    """
    Schema for the knowledge base file from which content
    and metadata is extracted.

    ``source_name``, ``source_id`` and ``source_type`` are required.
    ``date_created`` and ``last_modified`` default to the current time, in
    ISO 8601 format, at the moment each instance is created.
    """

    source_name: str
    source_id: str
    source_location: str = ""
    source_type: Union[DocumentTypeEnum, str]
    collection_id: str = ""
    # default_factory is used so the timestamp is taken per instance; a plain
    # `datetime.now().isoformat()` default is evaluated only once, at import time.
    date_created: str = Field(default_factory=lambda: datetime.now().isoformat())
    last_modified: str = Field(default_factory=lambda: datetime.now().isoformat())
    summary: str = ""
    partition_id: int = -1
    access_level: Union[AccessLevelEnum, int] = AccessLevelEnum.UNKNOWN

    @field_validator("date_created", "last_modified")
    @classmethod
    def validate_fields(cls, field_value):
        """Check the timestamp with ``datetools.validate_iso8601`` and return it unchanged."""
        datetools.validate_iso8601(field_value)
        return field_value
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class NearbyObjectsSubSchema(BaseModelNoExt):
    """
    Schema to hold related extracted object.

    All three lists default to a fresh empty list per instance.
    """

    content: List[str] = Field(default_factory=list)
    bbox: List[tuple] = Field(default_factory=list)
    type: List[str] = Field(default_factory=list)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class NearbyObjectsSchema(BaseModelNoExt):
    """
    Schema to hold types of related extracted objects.

    Groups nearby objects into text, images, and structured content; each
    group defaults to an empty ``NearbyObjectsSubSchema``.
    """

    text: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    images: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
    structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class ContentHierarchySchema(BaseModelNoExt):
    """
    Schema for the extracted content hierarchy.

    Every positional field defaults to -1.
    """

    # Position of the content within the source document.
    page_count: int = -1
    page: int = -1
    block: int = -1
    line: int = -1
    span: int = -1

    # Objects found near this content.
    nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class ContentMetadataSchema(BaseModelNoExt):
    """
    Data extracted from a source; generally Text or Image.

    ``type`` is the only required field.
    """

    type: ContentTypeEnum
    description: str = ""
    page_number: int = -1
    hierarchy: ContentHierarchySchema = ContentHierarchySchema()
    # Either an enum member or a free-form string; empty by default.
    subtype: Union[ContentTypeEnum, str] = ""
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class TextMetadataSchema(BaseModelNoExt):
    """
    Metadata specific to extracted text content.

    ``text_type`` is required; all other fields have defaults.
    """

    # Required: category of the text.
    text_type: TextTypeEnum

    # Optional summary of the text.
    summary: str = ""

    # Keywords may arrive as a single string, a list of strings, or a dict.
    keywords: Union[str, List[str], Dict] = ""

    # The default is the plain string "en", not an enum member.
    language: LanguageEnum = "en"  # default to Unknown? Maybe do some kind of heuristic check

    # Both location tuples default to four zeros; arity is not enforced by the annotation.
    text_location: tuple = (0, 0, 0, 0)
    text_location_max_dimensions: tuple = (0, 0, 0, 0)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class ImageMetadataSchema(BaseModelNoExt):
    """
    Metadata specific to extracted image content.

    ``image_type`` is required; all other fields have defaults. Negative
    ``width`` / ``height`` values are clamped to 0 (with a warning) rather than
    rejected.
    """

    # Required: either a DocumentTypeEnum member or a plain string.
    image_type: Union[DocumentTypeEnum, str]
    structured_image_type: ContentTypeEnum = ContentTypeEnum.NONE
    caption: str = ""
    text: str = ""
    # Location defaults to four zeros, max dimensions to two zeros; arity is not enforced.
    image_location: tuple = (0, 0, 0, 0)
    image_location_max_dimensions: tuple = (0, 0)
    uploaded_image_url: str = ""
    width: int = 0
    height: int = 0

    # Validators are explicitly marked as classmethods, matching the other
    # validators in this module and pydantic's documented usage.
    @field_validator("image_type")
    @classmethod
    def validate_image_type(cls, v):
        """
        Ensure ``image_type`` is a string or a ``DocumentTypeEnum`` member.

        Raises
        ------
        ValueError
            If the value is neither a ``str`` nor a ``DocumentTypeEnum``.
        """
        if not isinstance(v, (DocumentTypeEnum, str)):
            raise ValueError("image_type must be a string or DocumentTypeEnum")
        return v

    @field_validator("width", "height")
    @classmethod
    def clamp_non_negative(cls, v, field):
        """
        Clamp negative ``width`` / ``height`` values to 0.

        ``field`` is the validation-info object pydantic passes as the third
        argument; only its ``field_name`` is used, for the warning message.
        Non-negative values are returned unchanged.
        """
        if v < 0:
            logger.warning(f"{field.field_name} is negative; clamping to 0. Original value: {v}")
            return 0
        return v
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class TableMetadataSchema(BaseModelNoExt):
    """
    Metadata specific to extracted table content.

    ``table_format`` is required; all other fields have defaults.
    """

    caption: str = ""

    # Required: format of the table.
    table_format: TableFormatEnum

    # Table content and the format of that content (enum member or plain string).
    table_content: str = ""
    table_content_format: Union[TableFormatEnum, str] = ""

    # Location defaults to four zeros, max dimensions to two zeros; arity is not enforced.
    table_location: tuple = (0, 0, 0, 0)
    table_location_max_dimensions: tuple = (0, 0)

    uploaded_image_uri: str = ""
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class ChartMetadataSchema(BaseModelNoExt):
    """
    Metadata specific to extracted chart content.

    Declares the same fields, with the same ``table_*`` names, as
    ``TableMetadataSchema``. ``table_format`` is required; all other fields
    have defaults.
    """

    caption: str = ""

    # Required: format of the chart's tabular representation.
    table_format: TableFormatEnum

    # Content and the format of that content (enum member or plain string).
    table_content: str = ""
    table_content_format: Union[TableFormatEnum, str] = ""

    # Location defaults to four zeros, max dimensions to two zeros; arity is not enforced.
    table_location: tuple = (0, 0, 0, 0)
    table_location_max_dimensions: tuple = (0, 0)

    uploaded_image_uri: str = ""
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class AudioMetadataSchema(BaseModelNoExt):
    """
    Metadata specific to extracted audio content.

    Both fields are optional strings that default to the empty string.
    """

    # Transcript of the audio.
    audio_transcript: str = ""
    # Free-form audio type label.
    audio_type: str = ""
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# TODO consider deprecating this in favor of info msg...
|
|
162
|
+
class ErrorMetadataSchema(BaseModelNoExt):
    """
    Metadata describing an error attached to a record.

    ``task``, ``status`` and ``error_msg`` are required; ``source_id`` defaults
    to the empty string.
    """

    # Required: task and status the error is reported against.
    task: TaskTypeEnum
    status: StatusEnum

    # Optional identifier of the source.
    source_id: str = ""

    # Required: error message text.
    error_msg: str
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class InfoMessageMetadataSchema(BaseModelNoExt):
    """
    Metadata describing an informational message attached to a record.

    All four fields are required.
    """

    # Task and status the message is reported against.
    task: TaskTypeEnum
    status: StatusEnum

    # Message text.
    message: str

    # Boolean flag; its meaning is defined by the consumers of this schema.
    filter: bool
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# Main metadata schema
|
|
177
|
+
class MetadataSchema(BaseModelNoExt):
    """
    Top-level metadata record combining content with its per-type metadata.

    Every field is optional or has a default. Before field validation runs,
    ``check_metadata_type`` clears the type-specific sections that do not
    match ``content_metadata.type``.
    """

    content: str = ""
    content_url: str = ""
    embedding: Optional[List[float]] = None
    source_metadata: Optional[SourceMetadataSchema] = None
    content_metadata: Optional[ContentMetadataSchema] = None
    audio_metadata: Optional[AudioMetadataSchema] = None
    text_metadata: Optional[TextMetadataSchema] = None
    image_metadata: Optional[ImageMetadataSchema] = None
    table_metadata: Optional[TableMetadataSchema] = None
    chart_metadata: Optional[ChartMetadataSchema] = None
    error_metadata: Optional[ErrorMetadataSchema] = None
    info_message_metadata: Optional[InfoMessageMetadataSchema] = None
    debug_metadata: Optional[Dict[str, Any]] = None
    raise_on_failure: bool = False

    @model_validator(mode="before")
    @classmethod
    def check_metadata_type(cls, values):
        """
        Null out type-specific metadata that does not match the content type.

        The content type is read from ``content_metadata``, which may be a
        dict, an object exposing a ``type`` attribute, or missing/``None``.
        When it is missing or ``None`` the content type is treated as ``None``
        and all four gated sections are cleared.

        Parameters
        ----------
        values
            Raw input handed to the model before field validation.

        Returns
        -------
        The input unchanged when it is not a dict; otherwise a shallow copy
        with non-matching ``audio_metadata``, ``image_metadata``,
        ``text_metadata`` and ``table_metadata`` entries set to ``None``.
        """
        # A "before" validator can receive non-dict input; leave it for
        # pydantic's own validation instead of failing on ``.get``.
        if not isinstance(values, dict):
            return values

        content_metadata = values.get("content_metadata")
        if isinstance(content_metadata, dict):
            content_type = content_metadata.get("type")
        else:
            # Covers both an explicit None and an object carrying ``type``.
            content_type = getattr(content_metadata, "type", None)

        # Work on a shallow copy so the caller's dict is not modified.
        values = dict(values)

        # NOTE(review): chart_metadata is not gated on the content type here,
        # which mirrors the original behavior -- confirm that is intentional.
        gated_sections = (
            ("audio_metadata", ContentTypeEnum.AUDIO),
            ("image_metadata", ContentTypeEnum.IMAGE),
            ("text_metadata", ContentTypeEnum.TEXT),
            ("table_metadata", ContentTypeEnum.STRUCTURED),
        )
        for section_key, required_type in gated_sections:
            if content_type != required_type:
                values[section_key] = None

        return values
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def validate_metadata(metadata: Dict[str, Any]) -> MetadataSchema:
    """
    Build a MetadataSchema from a metadata dictionary, validating it on the way.

    Parameters:
    - metadata: Dictionary whose items are passed as keyword arguments to
      MetadataSchema.

    Returns:
    - The constructed MetadataSchema instance.

    Raises:
    - ValidationError: If the metadata does not conform to the schema.
    """
    validated = MetadataSchema(**metadata)
    return validated
|