nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,19 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # NOTE: This code is duplicated from the ingest service:
6
+ # src/nv_ingest_client/schemas/response_schema.py
7
+ # Eventually we should move all client wrappers for the message broker into a shared library that both the ingest
8
+ # service and the client can use.
9
+
10
+ from typing import Optional, Union
11
+ from pydantic import BaseModel
12
+
13
+
14
class ResponseSchema(BaseModel):
    # Response envelope: a required integer code plus an optional reason,
    # payload, and two correlation identifiers.

    # Required numeric status code.
    response_code: int
    # Reason text; falls back to "OK" when the producer does not supply one.
    response_reason: Optional[str] = "OK"
    # Payload: either a string or a dict, or None when there is no body.
    response: Optional[Union[str, dict]] = None
    # Unique trace ID
    trace_id: Optional[str] = None
    # Unique transaction ID
    transaction_id: Optional[str] = None
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,11 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ from pydantic import ConfigDict, BaseModel
7
+
8
+
9
+ # Define a base class with extra fields forbidden
10
class BaseModelNoExt(BaseModel):
    # Common base for schemas that must reject unknown keys: with
    # extra="forbid" pydantic reports any field that is not declared on the
    # model as a validation error instead of ignoring it.
    model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,355 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Any, Dict, List, Optional, Union, Annotated
7
+
8
+ from pydantic import Field, field_validator, model_validator
9
+
10
+ from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
11
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum, TaskTypeEnum, DocumentTypeEnum
12
+
13
+ # ------------------------------------------------------------------------------
14
+ # Logging Configuration
15
+ # ------------------------------------------------------------------------------
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ # ------------------------------------------------------------------------------
20
+ # Schemas: Common and Task-Specific
21
+ # ------------------------------------------------------------------------------
22
+
23
+
24
+ # Tracing Options Schema
25
class TracingOptionsSchema(BaseModelNoExt):
    # Per-job tracing options. Every field is optional; unknown keys are
    # rejected because the base class forbids extras.

    # Tracing switch; off unless explicitly requested.
    trace: bool = False
    # NOTE(review): presumably the client-side send timestamp; the unit is not
    # visible in this file -- confirm against the producer.
    ts_send: Optional[int] = None
    # Identifier used to correlate trace records.
    trace_id: Optional[str] = None

    # V2 PDF splitting support
    parent_job_id: Optional[str] = None
    page_num: Optional[int] = None
    total_pages: Optional[int] = None
33
+
34
+
35
+ # PDF Configuration Schema
36
class PdfConfigSchema(BaseModelNoExt):
    """PDF-specific configuration options for job submission.

    Note: split_page_count accepts any positive integer but will be clamped
    to [1, 128] range by the server at runtime.
    """

    # Only the lower bound (>= 1) is enforced by this schema; the default is 32.
    split_page_count: Annotated[int, Field(ge=1)] = 32
44
+
45
+
46
class RoutingOptionsSchema(BaseModelNoExt):
    # Queue routing hint for the QoS scheduler; None means no hint was given.
    queue_hint: Optional[str] = None

    @field_validator("queue_hint")
    @classmethod
    def validate_queue_hint(cls, v):
        """Lower-case ``queue_hint`` and check it against the accepted queue names.

        Parameters:
        - v: The value supplied for ``queue_hint`` (may be None).

        Returns:
        - None when no hint was given, otherwise the lower-cased hint.

        Raises:
        - ValueError: If the hint is not a string or is not one of the accepted names.
        """
        if v is None:
            return v
        if not isinstance(v, str):
            raise ValueError("queue_hint must be a string")

        # Matching is case-insensitive; the normalized form is what gets stored.
        normalized = v.lower()
        if normalized in {"default", "immediate", "micro", "small", "medium", "large"}:
            return normalized
        raise ValueError("queue_hint must be one of: default, immediate, micro, small, medium, large")
62
+
63
+
64
+ # Ingest Task Schemas
65
+
66
+
67
class IngestTaskSplitSchema(BaseModelNoExt):
    # Task properties for a text-splitting task.

    # Tokenizer identifier; None leaves the choice to the consumer of this schema.
    tokenizer: Optional[str] = None
    # Chunk size; must be strictly positive.
    chunk_size: Annotated[int, Field(gt=0)] = 1024
    # Overlap between consecutive chunks; non-negative and (see validator)
    # strictly smaller than chunk_size.
    chunk_overlap: Annotated[int, Field(ge=0)] = 150
    # Free-form extra parameters; a fresh dict per instance.
    params: dict = Field(default_factory=dict)

    @field_validator("chunk_overlap")
    @classmethod
    def check_chunk_overlap(cls, v, info):
        """Ensure ``chunk_overlap`` is strictly smaller than ``chunk_size``.

        Parameters:
        - v: The already type-checked ``chunk_overlap`` value.
        - info: Pydantic validation info; ``info.data`` holds the fields that
          were validated before this one.

        Returns:
        - The unchanged ``chunk_overlap`` value.

        Raises:
        - ValueError: If ``chunk_overlap`` is greater than or equal to ``chunk_size``.
        """
        # chunk_size is declared before chunk_overlap, so it is present in
        # info.data unless its own validation failed; in that case the
        # comparison is skipped and the chunk_size error is reported instead.
        if v is not None and "chunk_size" in info.data and v >= info.data["chunk_size"]:
            raise ValueError("chunk_overlap must be less than chunk_size")
        return v
78
+
79
+
80
class IngestTaskExtractSchema(BaseModelNoExt):
    # Task properties for an extraction task.

    # Type of document to extract from; string input is matched case-insensitively.
    document_type: DocumentTypeEnum
    # Name of the extraction method; not constrained by this schema.
    method: str
    # Free-form extra parameters; a fresh dict per instance.
    params: dict = Field(default_factory=dict)

    @field_validator("document_type", mode="before")
    @classmethod
    def case_insensitive_document_type(cls, v):
        """Convert the raw ``document_type`` input into a ``DocumentTypeEnum`` member.

        Strings are lower-cased before the enum lookup; other values are
        passed to the enum constructor unchanged.

        Raises:
        - ValueError: If the value does not correspond to any enum member.
        """
        candidate = v.lower() if isinstance(v, str) else v
        try:
            return DocumentTypeEnum(candidate)
        except ValueError:
            raise ValueError(f"{candidate} is not a valid DocumentTypeEnum value")
94
+
95
+
96
class IngestTaskStoreEmbedSchema(BaseModelNoExt):
    # Task properties for an embedding-storage task: only a free-form
    # parameter dict, created fresh for each instance.
    params: dict = Field(default_factory=dict)
98
+
99
+
100
class IngestTaskStoreSchema(BaseModelNoExt):
    # Task properties for a storage task.

    # Content selection flags: structured content is on by default, images off.
    structured: bool = True
    images: bool = False
    # Destination URI and backend-specific options.
    # NOTE(review): accepted URI schemes and option keys are not visible here.
    storage_uri: Optional[str] = None
    storage_options: dict = Field(default_factory=dict)
    # NOTE(review): presumably the base URL under which stored objects are
    # exposed -- confirm against the storage stage.
    public_base_url: Optional[str] = None
    # Free-form extra parameters; a fresh dict per instance.
    params: dict = Field(default_factory=dict)
107
+
108
+
109
+ # Captioning: All fields are optional and override default parameters.
110
class IngestTaskCaptionSchema(BaseModelNoExt):
    # Captioning: all fields are optional and override default parameters.

    # Credential; excluded from repr() so it does not appear in printed models.
    api_key: Optional[str] = Field(default=None, repr=False)
    endpoint_url: Optional[str] = None
    prompt: Optional[str] = None
    system_prompt: Optional[str] = None
    model_name: Optional[str] = None
116
+
117
+
118
class IngestTaskFilterParamsSchema(BaseModelNoExt):
    # Thresholds for the filter task.
    # NOTE(review): min_size is presumably in pixels -- confirm with the filter stage.
    min_size: int = 128
    # Aspect-ratio bounds; both float and int input are accepted.
    max_aspect_ratio: Union[float, int] = 5.0
    min_aspect_ratio: Union[float, int] = 0.2
    # NOTE(review): the exact effect of this flag is defined by the filter
    # stage, not by this schema.
    filter: bool = False
123
+
124
+
125
class IngestTaskFilterSchema(BaseModelNoExt):
    # Task properties for a filter task.

    # Content type the filter applies to; images by default.
    content_type: ContentTypeEnum = ContentTypeEnum.IMAGE
    # Filter thresholds; defaults to a params object built from its own defaults.
    params: IngestTaskFilterParamsSchema = IngestTaskFilterParamsSchema()
129
+
130
+
131
class IngestTaskDedupParams(BaseModelNoExt):
    # Parameters for the dedup task.
    # NOTE(review): the exact effect of this flag is defined by the dedup
    # stage, not by this schema.
    filter: bool = False
133
+
134
+
135
class IngestTaskDedupSchema(BaseModelNoExt):
    # Task properties for a deduplication task.

    # Content type the deduplication applies to; images by default.
    content_type: ContentTypeEnum = ContentTypeEnum.IMAGE
    # Dedup parameters; defaults to a params object built from its own defaults.
    params: IngestTaskDedupParams = IngestTaskDedupParams()
139
+
140
+
141
class IngestTaskEmbedSchema(BaseModelNoExt):
    # Task properties for an embedding task. Every field is optional.

    # Embedding service location, model and credential. The credential is
    # excluded from repr() so it does not appear in printed models.
    endpoint_url: Optional[str] = None
    model_name: Optional[str] = None
    api_key: Optional[str] = Field(default=None, repr=False)

    # NOTE(review): presumably drops elements whose embedding failed -- confirm
    # against the embedding stage.
    filter_errors: bool = False

    # Per-content-type modality selectors.
    # NOTE(review): accepted values are not visible in this file.
    text_elements_modality: Optional[str] = None
    image_elements_modality: Optional[str] = None
    structured_elements_modality: Optional[str] = None
    audio_elements_modality: Optional[str] = None

    # Names of the field to read content from and to write the result to.
    custom_content_field: Optional[str] = None
    result_target_field: Optional[str] = None

    # Requested embedding dimensionality; no bounds are enforced here.
    dimensions: Optional[int] = None
153
+
154
+
155
class IngestTaskVdbUploadSchema(BaseModelNoExt):
    # Task properties for a vector-database upload task.

    # Bulk-ingest switch and its associated path.
    bulk_ingest: bool = False
    bulk_ingest_path: Optional[str] = None
    # Free-form extra parameters; unlike the other task schemas this defaults
    # to None rather than an empty dict.
    params: Optional[dict] = None
    # Enabled by default for this task.
    filter_errors: bool = True
160
+
161
+
162
class IngestTaskAudioExtraction(BaseModelNoExt):
    # Task properties for audio extraction. Every field is optional.

    # Credential; excluded from repr() so it does not appear in printed models.
    auth_token: Optional[str] = Field(default=None, repr=False)

    # Service endpoints and the protocol used to reach them.
    # NOTE(review): accepted infer_protocol values are not visible in this file.
    grpc_endpoint: Optional[str] = None
    http_endpoint: Optional[str] = None
    infer_protocol: Optional[str] = None
    function_id: Optional[str] = None

    # TLS settings; the certificate is excluded from repr() as well.
    use_ssl: Optional[bool] = None
    ssl_cert: Optional[str] = Field(default=None, repr=False)

    # NOTE(review): presumably toggles splitting audio into segments -- confirm
    # against the audio extraction stage.
    segment_audio: Optional[bool] = None
171
+
172
+
173
class IngestTaskTableExtraction(BaseModelNoExt):
    # Task properties for table extraction: only a free-form parameter dict,
    # created fresh for each instance.
    params: dict = Field(default_factory=dict)
175
+
176
+
177
class IngestTaskChartExtraction(BaseModelNoExt):
    # Task properties for chart extraction: only a free-form parameter dict,
    # created fresh for each instance.
    params: dict = Field(default_factory=dict)
179
+
180
+
181
class IngestTaskInfographicExtraction(BaseModelNoExt):
    # Task properties for infographic extraction: only a free-form parameter
    # dict, created fresh for each instance.
    params: dict = Field(default_factory=dict)
183
+
184
+
185
class IngestTaskOCRExtraction(BaseModelNoExt):
    # Task properties for OCR extraction: only a free-form parameter dict,
    # created fresh for each instance.
    params: dict = Field(default_factory=dict)
187
+
188
+
189
class IngestTaskUDFSchema(BaseModelNoExt):
    # Task properties for a user-defined-function (UDF) task. A UDF is placed
    # either by numeric phase or relative to a named stage, never both.

    # NOTE(review): udf_function presumably carries the function source or an
    # import reference and udf_function_name the callable's name -- confirm
    # against the UDF loader.
    udf_function: str
    udf_function_name: str
    # Phase number, restricted to 1..5 when given.
    phase: Optional[int] = Field(default=None, ge=1, le=5)
    run_before: bool = Field(default=False, description="Execute UDF before the target stage")
    run_after: bool = Field(default=False, description="Execute UDF after the target stage")
    target_stage: Optional[str] = Field(
        default=None, description="Name of the stage to target (e.g., 'image_dedup', 'text_extract')"
    )

    @model_validator(mode="after")
    def validate_stage_targeting(self):
        """Validate that stage targeting configuration is consistent.

        Rules enforced:
        - exactly one of ``phase`` and ``target_stage`` is given;
        - ``target_stage``, when given, is a non-empty string;
        - ``run_before`` / ``run_after`` require ``target_stage``;
        - ``target_stage`` requires at least one of ``run_before`` / ``run_after``.

        Returns:
        - The validated model instance.

        Raises:
        - ValueError: If any of the rules above is violated.
        """
        has_phase = self.phase is not None
        has_target_stage = self.target_stage is not None
        has_timing = self.run_before or self.run_after

        # Must specify either phase or target_stage, but not both.
        if has_phase and has_target_stage:
            raise ValueError("Cannot specify both 'phase' and 'target_stage'. Please specify only one.")
        if not has_phase and not has_target_stage:
            raise ValueError("Must specify either 'phase' or 'target_stage'.")

        # An empty target_stage counts as "given" above but names no stage.
        # Reject it explicitly so it cannot slip past the timing checks below.
        if has_target_stage and not self.target_stage:
            raise ValueError("target_stage must be a non-empty string")

        # run_before / run_after are only meaningful relative to a named stage.
        if has_timing and not has_target_stage:
            raise ValueError("target_stage must be specified when using run_before or run_after")

        # A named stage needs to know on which side the UDF runs.
        if has_target_stage and not has_timing:
            raise ValueError("At least one of run_before or run_after must be True when target_stage is specified")

        return self
221
+
222
+
223
class IngestTaskSchema(BaseModelNoExt):
    # A single task of an ingest job: the task type, the type-specific
    # properties, and a failure-handling flag.

    type: TaskTypeEnum
    # Union of every task-properties schema; the before-validator below pins
    # the concrete schema according to `type`.
    task_properties: Union[
        IngestTaskSplitSchema,
        IngestTaskExtractSchema,
        IngestTaskStoreEmbedSchema,
        IngestTaskStoreSchema,
        IngestTaskEmbedSchema,
        IngestTaskCaptionSchema,
        IngestTaskDedupSchema,
        IngestTaskFilterSchema,
        IngestTaskVdbUploadSchema,
        IngestTaskAudioExtraction,
        IngestTaskTableExtraction,
        IngestTaskChartExtraction,
        IngestTaskInfographicExtraction,
        IngestTaskOCRExtraction,
        IngestTaskUDFSchema,
    ]
    raise_on_failure: bool = False

    @model_validator(mode="before")
    @classmethod
    def check_task_properties_type(cls, values):
        """Validate ``task_properties`` against the schema selected by ``type``.

        The task type is normalized to a ``TaskTypeEnum`` member (strings are
        matched case-insensitively) and ``task_properties`` is validated with
        the schema class registered for that type.

        Parameters:
        - values: Raw input for the model, normally a dict.

        Returns:
        - A new dict with ``type`` replaced by the enum member and
          ``task_properties`` replaced by the validated schema instance. The
          input dict is left untouched. Non-dict input is returned unchanged
          for pydantic's own validation to handle.

        Raises:
        - ValueError: If the task type is unknown or missing, or if
          ``task_properties`` is neither a mapping nor an instance of the
          expected schema.
        """
        if not isinstance(values, dict):
            return values

        task_type = values.get("type")

        # Ensure task_type is lowercased and converted to enum early
        if isinstance(task_type, str):
            task_type = task_type.lower()
            try:
                task_type = TaskTypeEnum(task_type)
            except ValueError:
                raise ValueError(f"{task_type} is not a valid TaskTypeEnum value")

        task_type_to_schema = {
            TaskTypeEnum.CAPTION: IngestTaskCaptionSchema,
            TaskTypeEnum.DEDUP: IngestTaskDedupSchema,
            TaskTypeEnum.EMBED: IngestTaskEmbedSchema,
            TaskTypeEnum.EXTRACT: IngestTaskExtractSchema,
            TaskTypeEnum.FILTER: IngestTaskFilterSchema,
            TaskTypeEnum.SPLIT: IngestTaskSplitSchema,
            TaskTypeEnum.STORE_EMBEDDING: IngestTaskStoreEmbedSchema,
            TaskTypeEnum.STORE: IngestTaskStoreSchema,
            TaskTypeEnum.VDB_UPLOAD: IngestTaskVdbUploadSchema,
            TaskTypeEnum.AUDIO_DATA_EXTRACT: IngestTaskAudioExtraction,
            TaskTypeEnum.TABLE_DATA_EXTRACT: IngestTaskTableExtraction,
            TaskTypeEnum.CHART_DATA_EXTRACT: IngestTaskChartExtraction,
            TaskTypeEnum.INFOGRAPHIC_DATA_EXTRACT: IngestTaskInfographicExtraction,
            TaskTypeEnum.OCR_DATA_EXTRACT: IngestTaskOCRExtraction,
            TaskTypeEnum.UDF: IngestTaskUDFSchema,
        }

        # Only enum members can match; guarding here also keeps unhashable
        # input (e.g. a list) from raising TypeError inside the lookup.
        expected_schema_cls = task_type_to_schema.get(task_type) if isinstance(task_type, TaskTypeEnum) else None
        if expected_schema_cls is None:
            raise ValueError(f"Unsupported or missing task_type '{task_type}'")

        task_properties = values.get("task_properties", {})
        if isinstance(task_properties, expected_schema_cls):
            # Already validated by the caller; reuse it as-is.
            validated_task_properties = task_properties
        elif hasattr(task_properties, "keys"):
            # Anything that supports ** unpacking (dicts and other mappings).
            validated_task_properties = expected_schema_cls(**task_properties)
        else:
            # None, lists, or an instance of a different schema would otherwise
            # surface as a raw TypeError instead of a validation error.
            raise ValueError(
                f"task_properties for task type '{task_type.value}' must be a mapping "
                f"or a {expected_schema_cls.__name__} instance"
            )

        # Build a new dict rather than rewriting `values`: when this model is
        # validated as an element of a list, `values` is the caller's own dict.
        normalized = dict(values)
        normalized["type"] = task_type  # ensure type is now always the enum
        normalized["task_properties"] = validated_task_properties
        return normalized

    @field_validator("type", mode="before")
    @classmethod
    def case_insensitive_task_type(cls, v):
        """Convert the raw ``type`` input into a ``TaskTypeEnum`` member.

        Strings are lower-cased before the enum lookup.

        Raises:
        - ValueError: If the value does not correspond to any enum member.
        """
        if isinstance(v, str):
            v = v.lower()
        try:
            return TaskTypeEnum(v)
        except ValueError:
            raise ValueError(f"{v} is not a valid TaskTypeEnum value")
295
+
296
+
297
+ # ------------------------------------------------------------------------------
298
+ # Schemas: Job Schemas
299
+ # ------------------------------------------------------------------------------
300
+
301
+
302
class JobPayloadSchema(BaseModelNoExt):
    # Payload of an ingest job, stored as four lists.
    # NOTE(review): the lists are presumably index-aligned (one entry per
    # document), but this schema does not enforce equal lengths.

    # Document contents, as text or raw bytes.
    content: List[Union[str, bytes]]
    source_name: List[str]
    # Source identifiers may be strings or integers.
    source_id: List[Union[str, int]]
    # Document types as plain strings; not checked against an enum here.
    document_type: List[str]
307
+
308
+
309
class IngestJobSchema(BaseModelNoExt):
    # Top-level ingest job: payload, identifier, ordered task list and
    # optional tracing / routing / PDF option groups.

    job_payload: JobPayloadSchema
    job_id: Union[str, int]
    tasks: List[IngestTaskSchema]
    tracing_options: Optional[TracingOptionsSchema] = None
    routing_options: Optional[RoutingOptionsSchema] = None
    pdf_config: Optional[PdfConfigSchema] = None

    @model_validator(mode="before")
    @classmethod
    def migrate_queue_hint(cls, values):
        """
        Backward-compatibility shim: if a legacy client sends
        tracing_options.queue_hint, move it into routing_options.queue_hint.

        The move only happens when both option groups are plain dicts (or
        routing_options is absent/empty) and routing_options does not already
        carry a queue_hint. In every other case the input is returned
        unchanged, so a leftover legacy key is reported by normal validation
        instead of being dropped silently.

        Parameters:
        - values: Raw input for the model, normally a dict.

        Returns:
        - The input unchanged, or a new dict with rebuilt ``tracing_options``
          and ``routing_options`` entries. The caller's dicts are never mutated.
        """
        if not isinstance(values, dict):
            return values

        topt = values.get("tracing_options") or {}
        if not isinstance(topt, dict) or "queue_hint" not in topt:
            return values

        ropt = values.get("routing_options") or {}
        if not isinstance(ropt, dict) or "queue_hint" in ropt:
            return values

        # Copy before editing: the nested dicts belong to the caller.
        new_topt = dict(topt)
        new_ropt = dict(ropt)
        new_ropt["queue_hint"] = new_topt.pop("queue_hint")

        migrated = dict(values)
        migrated["routing_options"] = new_ropt
        migrated["tracing_options"] = new_topt
        return migrated
334
+
335
+
336
+ # ------------------------------------------------------------------------------
337
+ # Utility Functions
338
+ # ------------------------------------------------------------------------------
339
+
340
+
341
def validate_ingest_job(job_data: Dict[str, Any]) -> IngestJobSchema:
    """
    Build an IngestJobSchema from a plain dictionary, validating it on the way.

    Parameters:
    - job_data: Dictionary describing the ingest job; its keys are passed to
      the schema as keyword arguments.

    Returns:
    - IngestJobSchema: The validated ingest job.

    Raises:
    - ValidationError: If the input data does not conform to the IngestJobSchema.
    """
    validated_job = IngestJobSchema(**job_data)
    return validated_job