nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# NOTE: This code is duplicated from the ingest service:
|
|
6
|
+
# src/nv_ingest_client/schemas/response_schema.py
|
|
7
|
+
# Eventually we should move all client wrappers for the message broker into a shared library that both the ingest
|
|
8
|
+
# service and the client can use.
|
|
9
|
+
|
|
10
|
+
from typing import Optional, Union
|
|
11
|
+
from pydantic import BaseModel
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ResponseSchema(BaseModel):
    """Response envelope used by the message-broker client wrappers.

    Only ``response_code`` is required; every other field has a default.
    """

    # Integer response code (required).
    response_code: int
    # Textual reason accompanying the code; defaults to "OK".
    response_reason: Optional[str] = "OK"
    # Response payload: a string, a dict, or None when there is none.
    response: Optional[Union[str, dict]] = None
    # Unique trace ID
    trace_id: Optional[str] = None
    # Unique transaction ID
    transaction_id: Optional[str] = None
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from pydantic import ConfigDict, BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Define a base class with extra fields forbidden
|
|
10
|
+
class BaseModelNoExt(BaseModel):
    """Pydantic base model that rejects any field not declared on the schema."""

    # extra="forbid": unknown keys are a validation error rather than being ignored.
    model_config = ConfigDict(extra="forbid")
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any, Dict, List, Optional, Union, Annotated
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, field_validator, model_validator
|
|
9
|
+
|
|
10
|
+
from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
|
|
11
|
+
from nv_ingest_api.internal.enums.common import ContentTypeEnum, TaskTypeEnum, DocumentTypeEnum
|
|
12
|
+
|
|
13
|
+
# ------------------------------------------------------------------------------
|
|
14
|
+
# Logging Configuration
|
|
15
|
+
# ------------------------------------------------------------------------------
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ------------------------------------------------------------------------------
|
|
20
|
+
# Schemas: Common and Task-Specific
|
|
21
|
+
# ------------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Tracing Options Schema
|
|
25
|
+
class TracingOptionsSchema(BaseModelNoExt):
    """Optional tracing settings attached to an ingest job.

    Every field has a default, so an empty mapping validates.
    """

    # Tracing flag; off unless explicitly set.
    trace: bool = False
    # NOTE(review): presumably the client-side send timestamp; units are not
    # visible in this file - confirm with the producer.
    ts_send: Optional[int] = None
    trace_id: Optional[str] = None

    # V2 PDF splitting support
    parent_job_id: Optional[str] = None
    page_num: Optional[int] = None
    total_pages: Optional[int] = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# PDF Configuration Schema
|
|
36
|
+
class PdfConfigSchema(BaseModelNoExt):
    """PDF-specific configuration options for job submission.

    Note: split_page_count accepts any positive integer but will be clamped
    to [1, 128] range by the server at runtime.
    """

    # Only the lower bound (>= 1) is enforced by this schema; default is 32.
    split_page_count: int = Field(default=32, ge=1)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class RoutingOptionsSchema(BaseModelNoExt):
    """Optional routing settings attached to an ingest job."""

    # Queue routing hint for QoS scheduler
    queue_hint: Optional[str] = None

    @field_validator("queue_hint")
    @classmethod
    def validate_queue_hint(cls, v):
        """Normalise ``queue_hint`` to lower case and check it is a known name.

        ``None`` is returned unchanged. Any other value must be a string whose
        lower-cased form is one of the allowed queue names; the lower-cased
        form is what gets stored.

        Raises:
            ValueError: if the value is not a string or not an allowed name.
        """
        if v is None:
            return v
        if not isinstance(v, str):
            raise ValueError("queue_hint must be a string")

        normalized = v.lower()
        if normalized in {"default", "immediate", "micro", "small", "medium", "large"}:
            return normalized
        raise ValueError("queue_hint must be one of: default, immediate, micro, small, medium, large")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Ingest Task Schemas
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class IngestTaskSplitSchema(BaseModelNoExt):
    """Properties for a ``split`` task.

    ``chunk_size`` must be positive and ``chunk_overlap`` non-negative; an
    explicitly supplied ``chunk_overlap`` must also be smaller than
    ``chunk_size``.
    """

    tokenizer: Optional[str] = None
    chunk_size: Annotated[int, Field(gt=0)] = 1024
    chunk_overlap: Annotated[int, Field(ge=0)] = 150
    params: dict = Field(default_factory=dict)

    @field_validator("chunk_overlap")
    @classmethod
    def check_chunk_overlap(cls, v, info):
        """Reject a ``chunk_overlap`` that is not strictly below ``chunk_size``.

        ``info.data`` holds the fields validated so far. ``chunk_size`` is
        declared before ``chunk_overlap`` and is therefore present unless its
        own validation failed, in which case this comparison is skipped.

        NOTE(review): field validators do not run for default values unless
        ``validate_default`` is enabled, so a small ``chunk_size`` combined
        with the default overlap of 150 is not checked here.

        Raises:
            ValueError: if ``chunk_overlap >= chunk_size``.
        """
        if v is not None and "chunk_size" in info.data and v >= info.data["chunk_size"]:
            raise ValueError("chunk_overlap must be less than chunk_size")
        return v
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class IngestTaskExtractSchema(BaseModelNoExt):
    """Properties for an ``extract`` task: document type, method and free-form params."""

    document_type: DocumentTypeEnum
    method: str
    params: dict = Field(default_factory=dict)

    @field_validator("document_type", mode="before")
    @classmethod
    def case_insensitive_document_type(cls, v):
        """Convert the raw value to ``DocumentTypeEnum``, lower-casing strings first.

        Raises:
            ValueError: if the (lower-cased) value is not a member of the enum.
        """
        candidate = v.lower() if isinstance(v, str) else v
        try:
            return DocumentTypeEnum(candidate)
        except ValueError:
            raise ValueError(f"{candidate} is not a valid DocumentTypeEnum value")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class IngestTaskStoreEmbedSchema(BaseModelNoExt):
    """Properties for a ``store_embedding`` task; only a free-form ``params`` dict."""

    # default_factory gives every instance its own empty dict.
    params: dict = Field(default_factory=dict)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class IngestTaskStoreSchema(BaseModelNoExt):
    """Properties for a ``store`` task.

    All fields are optional; ``structured`` defaults to True and ``images``
    to False.
    """

    # Content-selection flags.
    structured: bool = True
    images: bool = False

    # Storage destination settings.
    storage_uri: Optional[str] = None
    storage_options: dict = Field(default_factory=dict)
    public_base_url: Optional[str] = None

    # Free-form extra parameters.
    params: dict = Field(default_factory=dict)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# Captioning: All fields are optional and override default parameters.
|
|
110
|
+
class IngestTaskCaptionSchema(BaseModelNoExt):
    """Properties for a ``caption`` task.

    Every field is optional and defaults to None; supplied values override
    the default captioning parameters.
    """

    # repr=False keeps the key out of the model's repr() output.
    api_key: Optional[str] = Field(default=None, repr=False)
    endpoint_url: Optional[str] = None
    prompt: Optional[str] = None
    system_prompt: Optional[str] = None
    model_name: Optional[str] = None
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class IngestTaskFilterParamsSchema(BaseModelNoExt):
    """Parameter set nested inside ``IngestTaskFilterSchema``."""

    # Size threshold; default 128.
    min_size: int = 128
    # Aspect-ratio bounds; either a float or an int is accepted.
    max_aspect_ratio: Union[float, int] = 5.0
    min_aspect_ratio: Union[float, int] = 0.2
    # Off by default.
    filter: bool = False
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class IngestTaskFilterSchema(BaseModelNoExt):
    """Properties for a ``filter`` task.

    Defaults to image content with the default ``IngestTaskFilterParamsSchema``.
    """

    content_type: ContentTypeEnum = ContentTypeEnum.IMAGE
    params: IngestTaskFilterParamsSchema = IngestTaskFilterParamsSchema()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class IngestTaskDedupParams(BaseModelNoExt):
    """Parameter set nested inside ``IngestTaskDedupSchema``."""

    # Off by default.
    filter: bool = False
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class IngestTaskDedupSchema(BaseModelNoExt):
    """Properties for a ``dedup`` task.

    Defaults to image content with the default ``IngestTaskDedupParams``.
    """

    content_type: ContentTypeEnum = ContentTypeEnum.IMAGE
    params: IngestTaskDedupParams = IngestTaskDedupParams()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class IngestTaskEmbedSchema(BaseModelNoExt):
    """Properties for an ``embed`` task.

    Every field is optional; ``filter_errors`` defaults to False and all
    other fields default to None.
    """

    # Embedding service connection.
    endpoint_url: Optional[str] = None
    model_name: Optional[str] = None
    # repr=False keeps the key out of the model's repr() output.
    api_key: Optional[str] = Field(default=None, repr=False)

    filter_errors: bool = False

    # Per-element-type modality selectors.
    text_elements_modality: Optional[str] = None
    image_elements_modality: Optional[str] = None
    structured_elements_modality: Optional[str] = None
    audio_elements_modality: Optional[str] = None

    custom_content_field: Optional[str] = None
    result_target_field: Optional[str] = None
    dimensions: Optional[int] = None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class IngestTaskVdbUploadSchema(BaseModelNoExt):
    """Properties for a ``vdb_upload`` task."""

    bulk_ingest: bool = False
    bulk_ingest_path: Optional[str] = None
    # Unlike the other task schemas in this file, params defaults to None
    # rather than an empty dict.
    params: Optional[dict] = None
    # On by default for this task.
    filter_errors: bool = True
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class IngestTaskAudioExtraction(BaseModelNoExt):
    """Properties for an ``audio_data_extract`` task; every field defaults to None."""

    # repr=False keeps the token out of the model's repr() output.
    auth_token: Optional[str] = Field(default=None, repr=False)

    # Service endpoints and protocol selection.
    grpc_endpoint: Optional[str] = None
    http_endpoint: Optional[str] = None
    infer_protocol: Optional[str] = None
    function_id: Optional[str] = None

    # TLS settings; the certificate is likewise hidden from repr().
    use_ssl: Optional[bool] = None
    ssl_cert: Optional[str] = Field(default=None, repr=False)

    segment_audio: Optional[bool] = None
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class IngestTaskTableExtraction(BaseModelNoExt):
    """Properties for a ``table_data_extract`` task; only a free-form ``params`` dict."""

    # default_factory gives every instance its own empty dict.
    params: dict = Field(default_factory=dict)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class IngestTaskChartExtraction(BaseModelNoExt):
    """Properties for a ``chart_data_extract`` task; only a free-form ``params`` dict."""

    # default_factory gives every instance its own empty dict.
    params: dict = Field(default_factory=dict)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class IngestTaskInfographicExtraction(BaseModelNoExt):
    """Properties for an ``infographic_data_extract`` task; only a free-form ``params`` dict."""

    # default_factory gives every instance its own empty dict.
    params: dict = Field(default_factory=dict)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class IngestTaskOCRExtraction(BaseModelNoExt):
    """Properties for an ``ocr_data_extract`` task; only a free-form ``params`` dict."""

    # default_factory gives every instance its own empty dict.
    params: dict = Field(default_factory=dict)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class IngestTaskUDFSchema(BaseModelNoExt):
    """Properties for a ``udf`` task.

    A UDF is placed either by ``phase`` (an integer from 1 to 5) or by
    ``target_stage`` together with ``run_before`` and/or ``run_after``.
    """

    udf_function: str
    udf_function_name: str
    phase: Optional[int] = Field(default=None, ge=1, le=5)
    run_before: bool = Field(default=False, description="Execute UDF before the target stage")
    run_after: bool = Field(default=False, description="Execute UDF after the target stage")
    target_stage: Optional[str] = Field(
        default=None, description="Name of the stage to target (e.g., 'image_dedup', 'text_extract')"
    )

    @model_validator(mode="after")
    def validate_stage_targeting(self):
        """Check that the placement fields form a consistent combination.

        Rules, in the order they are checked:
          1. exactly one of ``phase`` / ``target_stage`` is not None;
          2. ``run_before`` / ``run_after`` require a ``target_stage``;
          3. a ``target_stage`` requires ``run_before`` or ``run_after``.

        NOTE(review): rule 1 tests ``is not None`` while rules 2 and 3 test
        truthiness, so ``target_stage=""`` with no phase and no timing flag
        passes every check - confirm whether that is intended.

        Raises:
            ValueError: if any rule is violated.
        """
        phase_given = self.phase is not None
        stage_given = self.target_stage is not None

        # Rule 1: both set, or neither set, is an error.
        if phase_given == stage_given:
            if phase_given:
                raise ValueError("Cannot specify both 'phase' and 'target_stage'. Please specify only one.")
            raise ValueError("Must specify either 'phase' or 'target_stage'.")

        wants_timing = self.run_before or self.run_after

        # Rule 2: timing flags only make sense relative to a target stage.
        if wants_timing and not self.target_stage:
            raise ValueError("target_stage must be specified when using run_before or run_after")

        # Rule 3: a target stage needs at least one timing flag.
        if self.target_stage and not wants_timing:
            raise ValueError("At least one of run_before or run_after must be True when target_stage is specified")

        return self
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class IngestTaskSchema(BaseModelNoExt):
    """A single task in an ingest job: its type plus the matching properties.

    ``task_properties`` is validated against the schema class that corresponds
    to ``type``, rather than against whichever union member happens to match.
    """

    type: TaskTypeEnum
    task_properties: Union[
        IngestTaskSplitSchema,
        IngestTaskExtractSchema,
        IngestTaskStoreEmbedSchema,
        IngestTaskStoreSchema,
        IngestTaskEmbedSchema,
        IngestTaskCaptionSchema,
        IngestTaskDedupSchema,
        IngestTaskFilterSchema,
        IngestTaskVdbUploadSchema,
        IngestTaskAudioExtraction,
        IngestTaskTableExtraction,
        IngestTaskChartExtraction,
        IngestTaskInfographicExtraction,
        IngestTaskOCRExtraction,
        IngestTaskUDFSchema,
    ]
    raise_on_failure: bool = False

    @model_validator(mode="before")
    @classmethod
    def check_task_properties_type(cls, values):
        """Build ``task_properties`` with the schema class selected by ``type``.

        The input dict is updated in place: ``type`` is replaced by the enum
        member and ``task_properties`` by the validated schema instance.
        Because of that write-back, a dict that has already been validated
        once carries a schema instance, which is accepted as-is here.

        Raises:
            ValueError: if ``type`` is missing or unknown, or if
                ``task_properties`` is None or an instance of a different
                schema class.
        """
        # A before-validator can receive arbitrary input (for example an
        # already-built model instance); only dict input is inspected here and
        # anything else is left to pydantic's regular validation.
        if not isinstance(values, dict):
            return values

        task_type = values.get("type")
        task_properties = values.get("task_properties", {})

        # Ensure task_type is lowercased and converted to enum early
        if isinstance(task_type, str):
            task_type = task_type.lower()
            try:
                task_type = TaskTypeEnum(task_type)
            except ValueError:
                raise ValueError(f"{task_type} is not a valid TaskTypeEnum value")

        task_type_to_schema = {
            TaskTypeEnum.CAPTION: IngestTaskCaptionSchema,
            TaskTypeEnum.DEDUP: IngestTaskDedupSchema,
            TaskTypeEnum.EMBED: IngestTaskEmbedSchema,
            TaskTypeEnum.EXTRACT: IngestTaskExtractSchema,
            TaskTypeEnum.FILTER: IngestTaskFilterSchema,
            TaskTypeEnum.SPLIT: IngestTaskSplitSchema,
            TaskTypeEnum.STORE_EMBEDDING: IngestTaskStoreEmbedSchema,
            TaskTypeEnum.STORE: IngestTaskStoreSchema,
            TaskTypeEnum.VDB_UPLOAD: IngestTaskVdbUploadSchema,
            TaskTypeEnum.AUDIO_DATA_EXTRACT: IngestTaskAudioExtraction,
            TaskTypeEnum.TABLE_DATA_EXTRACT: IngestTaskTableExtraction,
            TaskTypeEnum.CHART_DATA_EXTRACT: IngestTaskChartExtraction,
            TaskTypeEnum.INFOGRAPHIC_DATA_EXTRACT: IngestTaskInfographicExtraction,
            TaskTypeEnum.OCR_DATA_EXTRACT: IngestTaskOCRExtraction,
            TaskTypeEnum.UDF: IngestTaskUDFSchema,
        }

        expected_schema_cls = task_type_to_schema.get(task_type)
        if expected_schema_cls is None:
            raise ValueError(f"Unsupported or missing task_type '{task_type}'")

        if isinstance(task_properties, expected_schema_cls):
            # Already the right schema instance (e.g. this dict was validated
            # before and written back to); keep it.
            validated_task_properties = task_properties
        elif task_properties is None or isinstance(task_properties, BaseModelNoExt):
            # Raise ValueError so pydantic reports a validation error instead
            # of the bare TypeError that ``**`` unpacking would produce.
            raise ValueError(
                f"task_properties must be a mapping or an instance of {expected_schema_cls.__name__}, "
                f"got {type(task_properties).__name__}"
            )
        else:
            validated_task_properties = expected_schema_cls(**task_properties)

        values["type"] = task_type  # ensure type is now always the enum
        values["task_properties"] = validated_task_properties

        return values

    @field_validator("type", mode="before")
    @classmethod
    def case_insensitive_task_type(cls, v):
        """Convert the raw value to ``TaskTypeEnum``, lower-casing strings first.

        Raises:
            ValueError: if the (lower-cased) value is not a member of the enum.
        """
        if isinstance(v, str):
            v = v.lower()
        try:
            return TaskTypeEnum(v)
        except ValueError:
            raise ValueError(f"{v} is not a valid TaskTypeEnum value")
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# ------------------------------------------------------------------------------
|
|
298
|
+
# Schemas: Job Schemas
|
|
299
|
+
# ------------------------------------------------------------------------------
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
class JobPayloadSchema(BaseModelNoExt):
    """Payload of an ingest job, expressed as four required lists.

    NOTE(review): the lists look like parallel columns (one entry per source
    document), but nothing in this schema checks that their lengths match.
    """

    # Document contents, each given as str or bytes.
    content: List[Union[str, bytes]]
    source_name: List[str]
    # Identifiers may be strings or integers.
    source_id: List[Union[str, int]]
    document_type: List[str]
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
class IngestJobSchema(BaseModelNoExt):
    """Top-level schema for an ingest job submission.

    ``job_payload``, ``job_id`` and ``tasks`` are required; the tracing,
    routing and PDF option groups are optional.
    """

    job_payload: JobPayloadSchema
    job_id: Union[str, int]
    tasks: List[IngestTaskSchema]
    tracing_options: Optional[TracingOptionsSchema] = None
    routing_options: Optional[RoutingOptionsSchema] = None
    pdf_config: Optional[PdfConfigSchema] = None

    @model_validator(mode="before")
    @classmethod
    def migrate_queue_hint(cls, values):
        """
        Backward-compatibility shim: if a legacy client sends
        tracing_options.queue_hint, move it into routing_options.queue_hint.

        The legacy key is always removed from ``tracing_options`` (which
        forbids unknown fields). When ``routing_options`` already carries a
        ``queue_hint``, that value wins and the legacy one is dropped. The
        option dicts are updated in place.
        """
        # A before-validator may be handed non-dict input; leave that for
        # pydantic's regular validation to deal with.
        if not isinstance(values, dict):
            return values

        topt = values.get("tracing_options") or {}
        if not isinstance(topt, dict) or "queue_hint" not in topt:
            return values

        legacy_hint = topt.pop("queue_hint")
        values["tracing_options"] = topt

        ropt = values.get("routing_options") or {}
        if isinstance(ropt, dict):
            # Keep an explicitly supplied routing hint over the legacy one.
            ropt.setdefault("queue_hint", legacy_hint)
            values["routing_options"] = ropt
        else:
            # routing_options is not a plain dict, so the hint cannot be
            # merged into it; say so rather than dropping it silently.
            logger.warning(
                "Dropping legacy tracing_options.queue_hint: routing_options is a %s, not a dict",
                type(ropt).__name__,
            )

        return values
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# ------------------------------------------------------------------------------
|
|
337
|
+
# Utility Functions
|
|
338
|
+
# ------------------------------------------------------------------------------
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def validate_ingest_job(job_data: Dict[str, Any]) -> IngestJobSchema:
    """
    Build an IngestJobSchema from a plain dictionary describing an ingest job.

    Parameters:
    - job_data: Dictionary representing an ingest job; its items are passed to
      IngestJobSchema as keyword arguments.

    Returns:
    - IngestJobSchema: The validated ingest job.

    Raises:
    - ValidationError: If the input data does not conform to the IngestJobSchema.
    """
    validated_job = IngestJobSchema(**job_data)
    return validated_job
|