nv-ingest-client 2025.8.14.dev20250814.tar.gz → 2025.8.16.dev20250816.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-client might be problematic.

Files changed (65)
  1. {nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client.egg-info → nv_ingest_client-2025.8.16.dev20250816}/PKG-INFO +1 -1
  2. nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client/cli/util/click.py +525 -0
  3. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/client/interface.py +209 -26
  4. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/nv_ingest_cli.py +16 -0
  5. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/jobs/job_spec.py +29 -9
  6. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/__init__.py +6 -4
  7. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  8. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/caption.py +10 -16
  9. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  10. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/dedup.py +12 -21
  11. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/embed.py +21 -76
  12. nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client/primitives/tasks/extract.py +241 -0
  13. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/filter.py +21 -27
  14. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  15. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/split.py +17 -18
  16. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/store.py +29 -29
  17. nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client/primitives/tasks/task_base.py +74 -0
  18. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/task_factory.py +2 -0
  19. nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client/primitives/tasks/udf.py +352 -0
  20. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/vdb/milvus.py +1 -0
  21. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
  22. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -3
  23. nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/cli/util/click.py +0 -373
  24. nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/cli/util/tasks.py +0 -3
  25. nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/primitives/exceptions.py +0 -0
  26. nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/primitives/tasks/extract.py +0 -342
  27. nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/primitives/tasks/task_base.py +0 -145
  28. nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/primitives/tasks/transform.py +0 -0
  29. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/LICENSE +0 -0
  30. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/MANIFEST.in +0 -0
  31. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/README.md +0 -0
  32. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/pyproject.toml +0 -0
  33. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/setup.cfg +0 -0
  34. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/__init__.py +0 -0
  35. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/cli/__init__.py +0 -0
  36. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  37. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/cli/util/processing.py +0 -0
  38. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/cli/util/system.py +0 -0
  39. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/client/__init__.py +0 -0
  40. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/client/client.py +0 -0
  41. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/client/util/processing.py +0 -0
  42. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/__init__.py +0 -0
  43. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  44. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  45. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  46. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  47. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/__init__.py +0 -0
  48. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/dataset.py +0 -0
  49. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  50. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
  51. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/milvus.py +0 -0
  52. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/process_json_files.py +0 -0
  53. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/processing.py +0 -0
  54. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/system.py +0 -0
  55. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/transport.py +0 -0
  56. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/util.py +0 -0
  57. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  58. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
  59. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  60. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/zipkin.py +0 -0
  61. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  62. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  63. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/requires.txt +0 -0
  64. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  65. {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/version.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-client
-Version: 2025.8.14.dev20250814
+Version: 2025.8.16.dev20250816
 Summary: Python client for the nv-ingest service
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -0,0 +1,525 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import json
+import logging
+import os
+import random
+from enum import Enum
+from pprint import pprint
+from typing import Union, List, Any, Dict
+
+import click
+
+from nv_ingest_api.internal.enums.common import PipelinePhase
+from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
+from nv_ingest_client.util.processing import check_schema
+from nv_ingest_client.primitives.tasks import CaptionTask
+from nv_ingest_client.primitives.tasks import DedupTask
+from nv_ingest_client.primitives.tasks import EmbedTask
+from nv_ingest_client.primitives.tasks import ExtractTask
+from nv_ingest_client.primitives.tasks import FilterTask
+from nv_ingest_client.primitives.tasks import InfographicExtractionTask
+from nv_ingest_client.primitives.tasks import SplitTask
+from nv_ingest_client.primitives.tasks import StoreEmbedTask
+from nv_ingest_client.primitives.tasks import StoreTask
+from nv_ingest_client.primitives.tasks import UDFTask
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskUDFSchema
+from nv_ingest_client.util.util import generate_matching_files
+
+logger = logging.getLogger(__name__)
+
+
+class LogLevel(str, Enum):
+    """
+    Enum for specifying logging levels.
+
+    Attributes
+    ----------
+    DEBUG : str
+        Debug logging level.
+    INFO : str
+        Informational logging level.
+    WARNING : str
+        Warning logging level.
+    ERROR : str
+        Error logging level.
+    CRITICAL : str
+        Critical logging level.
+    """
+
+    DEBUG = "DEBUG"
+    INFO = "INFO"
+    WARNING = "WARNING"
+    ERROR = "ERROR"
+    CRITICAL = "CRITICAL"
+
+
+class ClientType(str, Enum):
+    """
+    Enum for specifying client types.
+
+    Attributes
+    ----------
+    REST : str
+        Represents a REST client.
+    REDIS : str
+        Represents a Redis client.
+    KAFKA : str
+        Represents a Kafka client.
+    """
+
+    REST = "REST"
+    REDIS = "REDIS"
+    KAFKA = "KAFKA"
+
+
+def debug_print_click_options(ctx: click.Context) -> None:
+    """
+    Retrieves all options from the Click context and pretty prints them.
+
+    Parameters
+    ----------
+    ctx : click.Context
+        The Click context object from which to retrieve the command options.
+    """
+    click_options: Dict[str, Any] = {}
+    for param in ctx.command.params:
+        if isinstance(param, click.Option):
+            value = ctx.params[param.name]
+            click_options[param.name] = value
+
+    pprint(click_options)
+
+
+def click_validate_file_exists(
+    ctx: click.Context, param: click.Parameter, value: Union[str, List[str], None]
+) -> List[str]:
+    """
+    Validates that the given file(s) exist.
+
+    Parameters
+    ----------
+    ctx : click.Context
+        The Click context.
+    param : click.Parameter
+        The parameter associated with the file option.
+    value : Union[str, List[str], None]
+        A file path or a list of file paths.
+
+    Returns
+    -------
+    List[str]
+        A list of validated file paths.
+
+    Raises
+    ------
+    click.BadParameter
+        If any file path does not exist.
+    """
+    if not value:
+        return []
+
+    if isinstance(value, str):
+        value = [value]
+    else:
+        value = list(value)
+
+    for filepath in value:
+        if not os.path.exists(filepath):
+            raise click.BadParameter(f"File does not exist: {filepath}")
+
+    return value
+
+
+# Define a union type for all supported task types.
+TaskType = Union[
+    CaptionTask,
+    DedupTask,
+    EmbedTask,
+    ExtractTask,
+    FilterTask,
+    InfographicExtractionTask,
+    SplitTask,
+    StoreEmbedTask,
+    StoreTask,
+    UDFTask,
+]
+
+
+def parse_task_options(task_id: str, options_str: str) -> Dict[str, Any]:
+    """
+    Parse the task options string as JSON.
+
+    Parameters
+    ----------
+    task_id : str
+        The identifier of the task for which options are being parsed.
+    options_str : str
+        The string containing JSON options.
+
+    Returns
+    -------
+    Dict[str, Any]
+        The parsed options as a dictionary.
+
+    Raises
+    ------
+    ValueError
+        If the JSON string is not well formatted. The error message will indicate the task,
+        the error details (e.g., expected property format), and show the input that was provided.
+    """
+    try:
+        options = json.loads(options_str)
+
+        # Convert string boolean values to actual booleans for extract tasks
+        if task_id == "extract":
+            boolean_fields = [
+                "extract_text",
+                "extract_images",
+                "extract_tables",
+                "extract_charts",
+                "extract_infographics",
+                "extract_page_as_image",
+            ]
+            for field in boolean_fields:
+                if field in options:
+                    value = options[field]
+                    if isinstance(value, str):
+                        if value.lower() in ("true", "1", "yes", "on"):
+                            options[field] = True
+                        elif value.lower() in ("false", "0", "no", "off"):
+                            options[field] = False
+                        else:
+                            raise ValueError(
+                                f"Invalid boolean value for {field}: '{value}'. Use true/false, 1/0, yes/no, or on/off."
+                            )
+
+        return options
+    except json.JSONDecodeError as e:
+        error_message = (
+            f"Invalid JSON format for task '{task_id}': {e.msg} at line {e.lineno} column {e.colno} (char {e.pos}). "
+            f"Input was: {options_str}"
+        )
+        raise ValueError(error_message)
+
+
+def click_validate_task(ctx: click.Context, param: click.Parameter, value: List[str]) -> Dict[str, TaskType]:
+    """
+    Validates and processes task definitions provided as strings.
+
+    Each task definition should be in the format "<task_id>:<json_options>".
+    If the separator ':' is missing, an empty JSON options dictionary is assumed.
+    The function uses a schema check (via check_schema) for validation and
+    instantiates the corresponding task.
+
+    Parameters
+    ----------
+    ctx : click.Context
+        The Click context.
+    param : click.Parameter
+        The parameter associated with the task option.
+    value : List[str]
+        A list of task strings to validate.
+
+    Returns
+    -------
+    Dict[str, TaskType]
+        A dictionary mapping task IDs to their corresponding task objects.
+
+    Raises
+    ------
+    click.BadParameter
+        If any task fails validation (including malformed JSON) or if duplicate tasks are detected.
+    """
+    validated_tasks: Dict[str, TaskType] = {}
+    validation_errors: List[str] = []
+
+    for task_str in value:
+        task_split = task_str.split(":", 1)
+        if len(task_split) != 2:
+            task_id, json_options = task_str, "{}"
+        else:
+            task_id, json_options = task_split
+
+        try:
+            options: Dict[str, Any] = parse_task_options(task_id, json_options)
+
+            if task_id == "split":
+                task_options = check_schema(IngestTaskSplitSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, SplitTask(**task_options.model_dump()))]
+            elif task_id == "extract":
+                # Map CLI parameters to API schema structure
+                method = options.pop("extract_method", None)
+                if method is None:
+                    method = "pdfium"  # Default fallback
+
+                # Build params dict for API schema
+                params = {k: v for k, v in options.items() if k != "document_type"}
+
+                # Validate with API schema
+                api_options = {
+                    "document_type": options.get("document_type"),
+                    "method": method,
+                    "params": params,
+                }
+                task_options = check_schema(IngestTaskExtractSchema, api_options, task_id, json_options)
+                new_task_id = f"{task_id}_{task_options.document_type.value}"
+
+                # Create ExtractTask with original CLI parameters
+                extract_task_params = {
+                    "document_type": task_options.document_type,
+                    "extract_method": task_options.method,
+                    **task_options.params,
+                }
+
+                # Start with the main extract task
+                new_task = [(new_task_id, ExtractTask(**extract_task_params))]
+
+                # Add ChartExtractionTask if extract_charts is True
+                if task_options.params.get("extract_charts", False):
+                    from nv_ingest_client.primitives.tasks import ChartExtractionTask
+
+                    chart_task_id = "chart_data_extract"
+                    chart_params = {"params": {}}  # ChartExtractionTask takes params dict
+                    new_task.append((chart_task_id, ChartExtractionTask(chart_params)))
+
+                # Add TableExtractionTask if extract_tables is True
+                if task_options.params.get("extract_tables", False):
+                    from nv_ingest_client.primitives.tasks import TableExtractionTask
+
+                    table_task_id = "table_data_extract"
+                    new_task.append((table_task_id, TableExtractionTask()))
+            elif task_id == "store":
+                task_options = check_schema(IngestTaskStoreSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, StoreTask(**task_options.model_dump()))]
+            elif task_id == "store_embedding":
+                task_options = check_schema(IngestTaskStoreEmbedSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, StoreEmbedTask(**task_options.model_dump()))]
+            elif task_id == "caption":
+                task_options = check_schema(IngestTaskCaptionSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                # Extract individual parameters from API schema for CaptionTask constructor
+                caption_params = {
+                    "api_key": task_options.api_key,
+                    "endpoint_url": task_options.endpoint_url,
+                    "prompt": task_options.prompt,
+                    "model_name": task_options.model_name,
+                }
+                new_task = [(new_task_id, CaptionTask(**caption_params))]
+            elif task_id == "dedup":
+                task_options = check_schema(IngestTaskDedupSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                # Extract individual parameters from API schema for DedupTask constructor
+                dedup_params = {
+                    "content_type": task_options.content_type,
+                    "filter": task_options.params.filter,
+                }
+                new_task = [(new_task_id, DedupTask(**dedup_params))]
+            elif task_id == "filter":
+                task_options = check_schema(IngestTaskFilterSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                # Extract individual parameters from API schema for FilterTask constructor
+                filter_params = {
+                    "content_type": task_options.content_type,
+                    "min_size": task_options.params.min_size,
+                    "max_aspect_ratio": task_options.params.max_aspect_ratio,
+                    "min_aspect_ratio": task_options.params.min_aspect_ratio,
+                    "filter": task_options.params.filter,
+                }
+                new_task = [(new_task_id, FilterTask(**filter_params))]
+            elif task_id == "embed":
+                task_options = check_schema(IngestTaskEmbedSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, EmbedTask(**task_options.model_dump()))]
+            elif task_id == "infographic":
+                task_options = check_schema(IngestTaskInfographicExtraction, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, InfographicExtractionTask(**task_options.model_dump()))]
+            elif task_id == "udf":
+                # Validate mutual exclusivity of target_stage and phase
+                has_target_stage = "target_stage" in options and options["target_stage"] is not None
+                has_phase = "phase" in options and options["phase"] is not None
+
+                if has_target_stage and has_phase:
+                    raise ValueError(
+                        "UDF task cannot specify both 'target_stage' and 'phase'. Please specify only one."
+                    )
+                elif not has_target_stage and not has_phase:
+                    raise ValueError("UDF task must specify either 'target_stage' or 'phase'.")
+
+                # Pre-process UDF task options to convert phase names to integers
+                if "phase" in options and isinstance(options["phase"], str):
+                    # Convert phase string to integer using the same logic as UDFTask
+                    phase_str = options["phase"].upper()
+                    phase_aliases = {
+                        "PRE_PROCESSING": PipelinePhase.PRE_PROCESSING,
+                        "PREPROCESSING": PipelinePhase.PRE_PROCESSING,
+                        "PRE": PipelinePhase.PRE_PROCESSING,
+                        "EXTRACTION": PipelinePhase.EXTRACTION,
+                        "EXTRACT": PipelinePhase.EXTRACTION,
+                        "POST_PROCESSING": PipelinePhase.POST_PROCESSING,
+                        "POSTPROCESSING": PipelinePhase.POST_PROCESSING,
+                        "POST": PipelinePhase.POST_PROCESSING,
+                        "MUTATION": PipelinePhase.MUTATION,
+                        "MUTATE": PipelinePhase.MUTATION,
+                        "TRANSFORM": PipelinePhase.TRANSFORM,
+                        "RESPONSE": PipelinePhase.RESPONSE,
+                        "RESP": PipelinePhase.RESPONSE,
+                    }
+
+                    if phase_str in phase_aliases:
+                        options["phase"] = phase_aliases[phase_str].value
+                    else:
+                        raise ValueError(f"Invalid phase name: {options['phase']}")
+
+                # Try to infer udf_function_name if not provided
+                if "udf_function_name" not in options or not options["udf_function_name"]:
+                    udf_function = options.get("udf_function", "")
+                    if udf_function:
+                        inferred_name = infer_udf_function_name(udf_function)
+                        if inferred_name:
+                            options["udf_function_name"] = inferred_name
+                            logger.info(f"Inferred UDF function name: {inferred_name}")
+                        else:
+                            raise ValueError(
+                                f"Could not infer UDF function name from '{udf_function}'. "
+                                "Please specify 'udf_function_name' explicitly."
+                            )
+
+                task_options = check_schema(IngestTaskUDFSchema, options, task_id, json_options)
+                new_task_id = f"{task_id}"
+                new_task = [(new_task_id, UDFTask(**task_options.model_dump()))]
+            else:
+                raise ValueError(f"Unsupported task type: {task_id}")
+
+            # Check for duplicate tasks - now allowing multiple tasks of the same type
+            if new_task_id in validated_tasks:
+                logger.debug(f"Multiple tasks detected for {new_task_id}, storing as list")
+
+            logger.debug("Adding task: %s", new_task_id)
+            for task_tuple in new_task:
+                if task_tuple[0] in validated_tasks:
+                    # Convert single task to list if needed, then append
+                    existing_task = validated_tasks[task_tuple[0]]
+                    if not isinstance(existing_task, list):
+                        validated_tasks[task_tuple[0]] = [existing_task]
+                    validated_tasks[task_tuple[0]].append(task_tuple[1])
+                else:
+                    validated_tasks[task_tuple[0]] = task_tuple[1]
+        except ValueError as e:
+            validation_errors.append(str(e))
+
+    if validation_errors:
+        error_message = "\n".join(validation_errors)
+        raise click.BadParameter(error_message)
+
+    return validated_tasks
+
+
+def click_validate_batch_size(ctx: click.Context, param: click.Parameter, value: int) -> int:
+    """
+    Validates that the batch size is at least 1.
+
+    Parameters
+    ----------
+    ctx : click.Context
+        The Click context.
+    param : click.Parameter
+        The parameter associated with the batch size option.
+    value : int
+        The batch size value provided.
+
+    Returns
+    -------
+    int
+        The validated batch size.
+
+    Raises
+    ------
+    click.BadParameter
+        If the batch size is less than 1.
+    """
+    if value < 1:
+        raise click.BadParameter("Batch size must be >= 1.")
+    return value
+
+
+def pre_process_dataset(dataset_json: str, shuffle_dataset: bool) -> List[str]:
+    """
+    Loads a dataset from a JSON file and optionally shuffles the list of files.
+
+    Parameters
+    ----------
+    dataset_json : str
+        The path to the dataset JSON file.
+    shuffle_dataset : bool
+        Whether to shuffle the dataset before processing.
+
+    Returns
+    -------
+    List[str]
+        The list of file paths from the dataset. If 'shuffle_dataset' is True,
+        the list will be shuffled.
+
+    Raises
+    ------
+    click.BadParameter
+        If the dataset file is not found or if its contents are not valid JSON.
+    """
+    try:
+        with open(dataset_json, "r") as f:
+            file_source = json.load(f)
+    except FileNotFoundError:
+        raise click.BadParameter(f"Dataset JSON file not found: {dataset_json}")
+    except json.JSONDecodeError:
+        raise click.BadParameter(f"Invalid JSON format in file: {dataset_json}")
+
+    file_source = file_source.get("sampled_files", [])
+    if shuffle_dataset:
+        random.shuffle(file_source)
+
+    return file_source
+
+
+def click_match_and_validate_files(ctx: click.Context, param: click.Parameter, value: List[str]) -> List[str]:
+    """
+    Matches and validates files based on the provided file source patterns.
+
+    Parameters
+    ----------
+    ctx : click.Context
+        The Click context.
+    param : click.Parameter
+        The parameter associated with the file matching option.
+    value : List[str]
+        A list of file source patterns to match against.
+
+    Returns
+    -------
+    List[str]
+        A list of matching file paths. If no files match, an empty list is returned.
+    """
+    if not value:
+        return []
+
+    matching_files = list(generate_matching_files(value))
+    if not matching_files:
+        logger.warning("No files found matching the specified patterns.")
+        return []
+
+    return matching_files
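
For context, a minimal sketch of how the validators added in click.py are typically wired into a Click command. The command name, option names, and default values below are illustrative assumptions, not the package's actual CLI definition (the real wiring lives in nv_ingest_cli.py, whose diff is not shown here).

# Hypothetical wiring -- illustrative only, not the package's actual CLI.
import click

from nv_ingest_client.cli.util.click import (
    click_validate_batch_size,
    click_validate_task,
    debug_print_click_options,
)


@click.command()
@click.option(
    "--task",
    multiple=True,
    callback=click_validate_task,
    help='Task definition as "<task_id>:<json_options>", e.g. extract:{"extract_text": "true"}.',
)
@click.option("--batch_size", default=64, callback=click_validate_batch_size, show_default=True)
@click.pass_context
def ingest(ctx, task, batch_size):
    # After validation, `task` is a dict mapping task ids to instantiated task
    # objects, because click_validate_task replaces the raw strings it receives.
    debug_print_click_options(ctx)


if __name__ == "__main__":
    ingest()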