nv-ingest-client 2025.8.14.dev20250814__tar.gz → 2025.8.16.dev20250816__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- {nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client.egg-info → nv_ingest_client-2025.8.16.dev20250816}/PKG-INFO +1 -1
- nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client/cli/util/click.py +525 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/client/interface.py +209 -26
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/nv_ingest_cli.py +16 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/jobs/job_spec.py +29 -9
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/__init__.py +6 -4
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/caption.py +10 -16
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/dedup.py +12 -21
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/embed.py +21 -76
- nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client/primitives/tasks/extract.py +241 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/filter.py +21 -27
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/split.py +17 -18
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/store.py +29 -29
- nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client/primitives/tasks/task_base.py +74 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/task_factory.py +2 -0
- nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client/primitives/tasks/udf.py +352 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/vdb/milvus.py +1 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -3
- nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/cli/util/click.py +0 -373
- nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/cli/util/tasks.py +0 -3
- nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/primitives/exceptions.py +0 -0
- nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/primitives/tasks/extract.py +0 -342
- nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/primitives/tasks/task_base.py +0 -145
- nv_ingest_client-2025.8.14.dev20250814/src/nv_ingest_client/primitives/tasks/transform.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/LICENSE +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/README.md +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/pyproject.toml +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/setup.cfg +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/client/client.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/util.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.8.14.dev20250814 → nv_ingest_client-2025.8.16.dev20250816}/src/version.py +0 -0
|
@@ -0,0 +1,525 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import random
|
|
10
|
+
from enum import Enum
|
|
11
|
+
from pprint import pprint
|
|
12
|
+
from typing import Union, List, Any, Dict
|
|
13
|
+
|
|
14
|
+
import click
|
|
15
|
+
|
|
16
|
+
from nv_ingest_api.internal.enums.common import PipelinePhase
|
|
17
|
+
from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
|
|
18
|
+
from nv_ingest_client.util.processing import check_schema
|
|
19
|
+
from nv_ingest_client.primitives.tasks import CaptionTask
|
|
20
|
+
from nv_ingest_client.primitives.tasks import DedupTask
|
|
21
|
+
from nv_ingest_client.primitives.tasks import EmbedTask
|
|
22
|
+
from nv_ingest_client.primitives.tasks import ExtractTask
|
|
23
|
+
from nv_ingest_client.primitives.tasks import FilterTask
|
|
24
|
+
from nv_ingest_client.primitives.tasks import InfographicExtractionTask
|
|
25
|
+
from nv_ingest_client.primitives.tasks import SplitTask
|
|
26
|
+
from nv_ingest_client.primitives.tasks import StoreEmbedTask
|
|
27
|
+
from nv_ingest_client.primitives.tasks import StoreTask
|
|
28
|
+
from nv_ingest_client.primitives.tasks import UDFTask
|
|
29
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
|
|
30
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
|
|
31
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
|
|
32
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
|
|
33
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
|
|
34
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
|
|
35
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
|
|
36
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
|
|
37
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
|
|
38
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskUDFSchema
|
|
39
|
+
from nv_ingest_client.util.util import generate_matching_files
|
|
40
|
+
|
|
41
|
+
logger = logging.getLogger(__name__)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class LogLevel(str, Enum):
    """
    Supported logging verbosity levels.

    Each member's value is the standard ``logging`` level name, so the
    enum value can be handed directly to ``logging`` configuration.

    Attributes
    ----------
    DEBUG : str
        Debug logging level.
    INFO : str
        Informational logging level.
    WARNING : str
        Warning logging level.
    ERROR : str
        Error logging level.
    CRITICAL : str
        Critical logging level.
    """

    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class ClientType(str, Enum):
    """
    Transport client flavors the CLI can talk to.

    Attributes
    ----------
    REST : str
        Represents a REST client.
    REDIS : str
        Represents a Redis client.
    KAFKA : str
        Represents a Kafka client.
    """

    REST = "REST"
    REDIS = "REDIS"
    KAFKA = "KAFKA"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def debug_print_click_options(ctx: click.Context) -> None:
    """
    Retrieves all options from the Click context and pretty prints them.

    Parameters
    ----------
    ctx : click.Context
        The Click context object from which to retrieve the command options.
    """
    # Use .get() rather than indexing so this purely-diagnostic helper can
    # never raise KeyError if an option is absent from the parsed params.
    click_options: Dict[str, Any] = {
        option.name: ctx.params.get(option.name)
        for option in ctx.command.params
        if isinstance(option, click.Option)
    }
    pprint(click_options)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def click_validate_file_exists(
    ctx: click.Context, param: click.Parameter, value: Union[str, List[str], None]
) -> List[str]:
    """
    Click callback that verifies every supplied file path exists on disk.

    Parameters
    ----------
    ctx : click.Context
        The Click context.
    param : click.Parameter
        The parameter associated with the file option.
    value : Union[str, List[str], None]
        A file path or a list of file paths.

    Returns
    -------
    List[str]
        A list of validated file paths (empty when no value was given).

    Raises
    ------
    click.BadParameter
        If any file path does not exist.
    """
    if not value:
        return []

    # Normalize a single string into a one-element list; copy tuples/lists.
    paths = [value] if isinstance(value, str) else list(value)

    for candidate in paths:
        if not os.path.exists(candidate):
            raise click.BadParameter(f"File does not exist: {candidate}")

    return paths
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Define a union type for all supported task types. click_validate_task maps
# task-id strings to instances of these primitives (or lists of them when the
# same task id is supplied more than once).
TaskType = Union[
    CaptionTask,
    DedupTask,
    EmbedTask,
    ExtractTask,
    FilterTask,
    InfographicExtractionTask,
    SplitTask,
    StoreEmbedTask,
    StoreTask,
    UDFTask,
]
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def parse_task_options(task_id: str, options_str: str) -> Dict[str, Any]:
    """
    Parse the task options string as JSON.

    For ``extract`` tasks, string-valued boolean flags (``"true"``, ``"0"``,
    ``"yes"``, ``"off"``, ...) are normalized to real booleans so shell users
    need not produce JSON-native ``true``/``false`` literals.

    Parameters
    ----------
    task_id : str
        The identifier of the task for which options are being parsed.
    options_str : str
        The string containing JSON options.

    Returns
    -------
    Dict[str, Any]
        The parsed options as a dictionary.

    Raises
    ------
    ValueError
        If the JSON string is not well formatted, or an extract boolean flag
        holds an unrecognized string value. The error message names the task,
        the error details, and the offending input.
    """
    truthy = {"true", "1", "yes", "on"}
    falsy = {"false", "0", "no", "off"}
    extract_boolean_fields = (
        "extract_text",
        "extract_images",
        "extract_tables",
        "extract_charts",
        "extract_infographics",
        "extract_page_as_image",
    )

    try:
        options = json.loads(options_str)
    except json.JSONDecodeError as e:
        # Re-raise with task context and the offending input; chain the
        # original decode error so the root cause stays visible.
        raise ValueError(
            f"Invalid JSON format for task '{task_id}': {e.msg} at line {e.lineno} column {e.colno} (char {e.pos}). "
            f"Input was: {options_str}"
        ) from e

    # Convert string boolean values to actual booleans for extract tasks.
    # Guard on dict so non-object JSON (e.g. a list) passes through unchanged
    # instead of raising AttributeError.
    if task_id == "extract" and isinstance(options, dict):
        for field in extract_boolean_fields:
            value = options.get(field)
            if isinstance(value, str):
                lowered = value.lower()
                if lowered in truthy:
                    options[field] = True
                elif lowered in falsy:
                    options[field] = False
                else:
                    raise ValueError(
                        f"Invalid boolean value for {field}: '{value}'. Use true/false, 1/0, yes/no, or on/off."
                    )

    return options
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def click_validate_task(ctx: click.Context, param: click.Parameter, value: List[str]) -> Dict[str, TaskType]:
    """
    Validates and processes task definitions provided as strings.

    Each task definition should be in the format "<task_id>:<json_options>".
    If the separator ':' is missing, an empty JSON options dictionary is assumed.
    The function uses a schema check (via check_schema) for validation and
    instantiates the corresponding task.

    Parameters
    ----------
    ctx : click.Context
        The Click context.
    param : click.Parameter
        The parameter associated with the task option.
    value : List[str]
        A list of task strings to validate.

    Returns
    -------
    Dict[str, TaskType]
        A dictionary mapping task IDs to their corresponding task objects.
        When the same task id is produced more than once, the entry becomes
        a list of task objects rather than a single object.

    Raises
    ------
    click.BadParameter
        If any task fails validation (including malformed JSON) or if duplicate tasks are detected.
    """
    validated_tasks: Dict[str, TaskType] = {}
    validation_errors: List[str] = []

    for task_str in value:
        # Split only on the first ':' so the JSON body may itself contain colons.
        task_split = task_str.split(":", 1)
        if len(task_split) != 2:
            task_id, json_options = task_str, "{}"
        else:
            task_id, json_options = task_split

        try:
            options: Dict[str, Any] = parse_task_options(task_id, json_options)

            if task_id == "split":
                task_options = check_schema(IngestTaskSplitSchema, options, task_id, json_options)
                new_task_id = f"{task_id}"
                new_task = [(new_task_id, SplitTask(**task_options.model_dump()))]
            elif task_id == "extract":
                # Map CLI parameters to API schema structure
                method = options.pop("extract_method", None)
                if method is None:
                    method = "pdfium"  # Default fallback
                
                # Build params dict for API schema
                params = {k: v for k, v in options.items() if k != "document_type"}

                # Validate with API schema
                api_options = {
                    "document_type": options.get("document_type"),
                    "method": method,
                    "params": params,
                }
                task_options = check_schema(IngestTaskExtractSchema, api_options, task_id, json_options)
                # Task id embeds the document type so extracts for different
                # document types do not collide in validated_tasks.
                new_task_id = f"{task_id}_{task_options.document_type.value}"

                # Create ExtractTask with original CLI parameters
                extract_task_params = {
                    "document_type": task_options.document_type,
                    "extract_method": task_options.method,
                    **task_options.params,
                }

                # Start with the main extract task
                new_task = [(new_task_id, ExtractTask(**extract_task_params))]

                # Add ChartExtractionTask if extract_charts is True
                if task_options.params.get("extract_charts", False):
                    from nv_ingest_client.primitives.tasks import ChartExtractionTask

                    chart_task_id = "chart_data_extract"
                    chart_params = {"params": {}}  # ChartExtractionTask takes params dict
                    new_task.append((chart_task_id, ChartExtractionTask(chart_params)))

                # Add TableExtractionTask if extract_tables is True
                if task_options.params.get("extract_tables", False):
                    from nv_ingest_client.primitives.tasks import TableExtractionTask

                    table_task_id = "table_data_extract"
                    new_task.append((table_task_id, TableExtractionTask()))
            elif task_id == "store":
                task_options = check_schema(IngestTaskStoreSchema, options, task_id, json_options)
                new_task_id = f"{task_id}"
                new_task = [(new_task_id, StoreTask(**task_options.model_dump()))]
            elif task_id == "store_embedding":
                task_options = check_schema(IngestTaskStoreEmbedSchema, options, task_id, json_options)
                new_task_id = f"{task_id}"
                new_task = [(new_task_id, StoreEmbedTask(**task_options.model_dump()))]
            elif task_id == "caption":
                task_options = check_schema(IngestTaskCaptionSchema, options, task_id, json_options)
                new_task_id = f"{task_id}"
                # Extract individual parameters from API schema for CaptionTask constructor
                caption_params = {
                    "api_key": task_options.api_key,
                    "endpoint_url": task_options.endpoint_url,
                    "prompt": task_options.prompt,
                    "model_name": task_options.model_name,
                }
                new_task = [(new_task_id, CaptionTask(**caption_params))]
            elif task_id == "dedup":
                task_options = check_schema(IngestTaskDedupSchema, options, task_id, json_options)
                new_task_id = f"{task_id}"
                # Extract individual parameters from API schema for DedupTask constructor
                dedup_params = {
                    "content_type": task_options.content_type,
                    "filter": task_options.params.filter,
                }
                new_task = [(new_task_id, DedupTask(**dedup_params))]
            elif task_id == "filter":
                task_options = check_schema(IngestTaskFilterSchema, options, task_id, json_options)
                new_task_id = f"{task_id}"
                # Extract individual parameters from API schema for FilterTask constructor
                filter_params = {
                    "content_type": task_options.content_type,
                    "min_size": task_options.params.min_size,
                    "max_aspect_ratio": task_options.params.max_aspect_ratio,
                    "min_aspect_ratio": task_options.params.min_aspect_ratio,
                    "filter": task_options.params.filter,
                }
                new_task = [(new_task_id, FilterTask(**filter_params))]
            elif task_id == "embed":
                task_options = check_schema(IngestTaskEmbedSchema, options, task_id, json_options)
                new_task_id = f"{task_id}"
                new_task = [(new_task_id, EmbedTask(**task_options.model_dump()))]
            elif task_id == "infographic":
                task_options = check_schema(IngestTaskInfographicExtraction, options, task_id, json_options)
                new_task_id = f"{task_id}"
                new_task = [(new_task_id, InfographicExtractionTask(**task_options.model_dump()))]
            elif task_id == "udf":
                # Validate mutual exclusivity of target_stage and phase
                has_target_stage = "target_stage" in options and options["target_stage"] is not None
                has_phase = "phase" in options and options["phase"] is not None

                if has_target_stage and has_phase:
                    raise ValueError(
                        "UDF task cannot specify both 'target_stage' and 'phase'. Please specify only one."
                    )
                elif not has_target_stage and not has_phase:
                    raise ValueError("UDF task must specify either 'target_stage' or 'phase'.")

                # Pre-process UDF task options to convert phase names to integers
                if "phase" in options and isinstance(options["phase"], str):
                    # Convert phase string to integer using the same logic as UDFTask
                    phase_str = options["phase"].upper()
                    phase_aliases = {
                        "PRE_PROCESSING": PipelinePhase.PRE_PROCESSING,
                        "PREPROCESSING": PipelinePhase.PRE_PROCESSING,
                        "PRE": PipelinePhase.PRE_PROCESSING,
                        "EXTRACTION": PipelinePhase.EXTRACTION,
                        "EXTRACT": PipelinePhase.EXTRACTION,
                        "POST_PROCESSING": PipelinePhase.POST_PROCESSING,
                        "POSTPROCESSING": PipelinePhase.POST_PROCESSING,
                        "POST": PipelinePhase.POST_PROCESSING,
                        "MUTATION": PipelinePhase.MUTATION,
                        "MUTATE": PipelinePhase.MUTATION,
                        "TRANSFORM": PipelinePhase.TRANSFORM,
                        "RESPONSE": PipelinePhase.RESPONSE,
                        "RESP": PipelinePhase.RESPONSE,
                    }

                    if phase_str in phase_aliases:
                        options["phase"] = phase_aliases[phase_str].value
                    else:
                        raise ValueError(f"Invalid phase name: {options['phase']}")

                # Try to infer udf_function_name if not provided
                if "udf_function_name" not in options or not options["udf_function_name"]:
                    udf_function = options.get("udf_function", "")
                    if udf_function:
                        inferred_name = infer_udf_function_name(udf_function)
                        if inferred_name:
                            options["udf_function_name"] = inferred_name
                            logger.info(f"Inferred UDF function name: {inferred_name}")
                        else:
                            raise ValueError(
                                f"Could not infer UDF function name from '{udf_function}'. "
                                "Please specify 'udf_function_name' explicitly."
                            )

                task_options = check_schema(IngestTaskUDFSchema, options, task_id, json_options)
                new_task_id = f"{task_id}"
                new_task = [(new_task_id, UDFTask(**task_options.model_dump()))]
            else:
                raise ValueError(f"Unsupported task type: {task_id}")

            # Check for duplicate tasks - now allowing multiple tasks of the same type
            if new_task_id in validated_tasks:
                logger.debug(f"Multiple tasks detected for {new_task_id}, storing as list")

            logger.debug("Adding task: %s", new_task_id)
            for task_tuple in new_task:
                if task_tuple[0] in validated_tasks:
                    # Convert single task to list if needed, then append
                    existing_task = validated_tasks[task_tuple[0]]
                    if not isinstance(existing_task, list):
                        validated_tasks[task_tuple[0]] = [existing_task]
                    validated_tasks[task_tuple[0]].append(task_tuple[1])
                else:
                    validated_tasks[task_tuple[0]] = task_tuple[1]
        except ValueError as e:
            # Accumulate per-task errors so the user sees all problems at once.
            validation_errors.append(str(e))

    if validation_errors:
        error_message = "\n".join(validation_errors)
        raise click.BadParameter(error_message)

    return validated_tasks
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def click_validate_batch_size(ctx: click.Context, param: click.Parameter, value: int) -> int:
    """
    Click callback enforcing a minimum batch size of 1.

    Parameters
    ----------
    ctx : click.Context
        The Click context.
    param : click.Parameter
        The parameter associated with the batch size option.
    value : int
        The batch size value provided.

    Returns
    -------
    int
        The validated batch size.

    Raises
    ------
    click.BadParameter
        If the batch size is less than 1.
    """
    if value >= 1:
        return value
    raise click.BadParameter("Batch size must be >= 1.")
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def pre_process_dataset(dataset_json: str, shuffle_dataset: bool) -> List[str]:
    """
    Load the file list from a dataset JSON file, optionally shuffling it.

    Parameters
    ----------
    dataset_json : str
        The path to the dataset JSON file.
    shuffle_dataset : bool
        Whether to shuffle the dataset before processing.

    Returns
    -------
    List[str]
        The file paths found under the dataset's "sampled_files" key. If
        'shuffle_dataset' is True, the list is shuffled in place first.

    Raises
    ------
    click.BadParameter
        If the dataset file is not found or if its contents are not valid JSON.
    """
    try:
        with open(dataset_json, "r") as handle:
            payload = json.load(handle)
    except FileNotFoundError:
        raise click.BadParameter(f"Dataset JSON file not found: {dataset_json}")
    except json.JSONDecodeError:
        raise click.BadParameter(f"Invalid JSON format in file: {dataset_json}")

    sampled = payload.get("sampled_files", [])
    if shuffle_dataset:
        random.shuffle(sampled)

    return sampled
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def click_match_and_validate_files(ctx: click.Context, param: click.Parameter, value: List[str]) -> List[str]:
    """
    Click callback expanding file source patterns into concrete file paths.

    Parameters
    ----------
    ctx : click.Context
        The Click context.
    param : click.Parameter
        The parameter associated with the file matching option.
    value : List[str]
        A list of file source patterns to match against.

    Returns
    -------
    List[str]
        A list of matching file paths. If no files match, an empty list is returned.
    """
    if not value:
        return []

    matches = list(generate_matching_files(value))
    if not matches:
        # Not an error: an empty match set is returned to the caller, but
        # surface a warning so the user knows nothing will be processed.
        logger.warning("No files found matching the specified patterns.")

    return matches