nv-ingest-client 2025.8.13.dev20250813__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/cli/util/click.py +182 -30
- nv_ingest_client/client/interface.py +209 -26
- nv_ingest_client/nv_ingest_cli.py +16 -0
- nv_ingest_client/primitives/jobs/job_spec.py +29 -9
- nv_ingest_client/primitives/tasks/__init__.py +6 -4
- nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
- nv_ingest_client/primitives/tasks/caption.py +10 -16
- nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
- nv_ingest_client/primitives/tasks/dedup.py +12 -21
- nv_ingest_client/primitives/tasks/embed.py +21 -76
- nv_ingest_client/primitives/tasks/extract.py +67 -168
- nv_ingest_client/primitives/tasks/filter.py +21 -27
- nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
- nv_ingest_client/primitives/tasks/split.py +17 -18
- nv_ingest_client/primitives/tasks/store.py +29 -29
- nv_ingest_client/primitives/tasks/task_base.py +1 -72
- nv_ingest_client/primitives/tasks/task_factory.py +2 -0
- nv_ingest_client/primitives/tasks/udf.py +352 -0
- nv_ingest_client/util/vdb/milvus.py +1 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/RECORD +25 -27
- nv_ingest_client/cli/util/tasks.py +0 -3
- nv_ingest_client/primitives/exceptions.py +0 -0
- nv_ingest_client/primitives/tasks/transform.py +0 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
|
@@ -12,23 +12,30 @@ from pprint import pprint
|
|
|
12
12
|
from typing import Union, List, Any, Dict
|
|
13
13
|
|
|
14
14
|
import click
|
|
15
|
+
|
|
16
|
+
from nv_ingest_api.internal.enums.common import PipelinePhase
|
|
17
|
+
from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
|
|
15
18
|
from nv_ingest_client.util.processing import check_schema
|
|
16
19
|
from nv_ingest_client.primitives.tasks import CaptionTask
|
|
17
20
|
from nv_ingest_client.primitives.tasks import DedupTask
|
|
18
21
|
from nv_ingest_client.primitives.tasks import EmbedTask
|
|
19
22
|
from nv_ingest_client.primitives.tasks import ExtractTask
|
|
20
23
|
from nv_ingest_client.primitives.tasks import FilterTask
|
|
24
|
+
from nv_ingest_client.primitives.tasks import InfographicExtractionTask
|
|
21
25
|
from nv_ingest_client.primitives.tasks import SplitTask
|
|
22
26
|
from nv_ingest_client.primitives.tasks import StoreEmbedTask
|
|
23
27
|
from nv_ingest_client.primitives.tasks import StoreTask
|
|
24
|
-
from nv_ingest_client.primitives.tasks
|
|
25
|
-
from
|
|
26
|
-
from
|
|
27
|
-
from
|
|
28
|
-
from
|
|
29
|
-
from
|
|
30
|
-
from
|
|
31
|
-
from
|
|
28
|
+
from nv_ingest_client.primitives.tasks import UDFTask
|
|
29
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
|
|
30
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
|
|
31
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
|
|
32
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
|
|
33
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
|
|
34
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
|
|
35
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
|
|
36
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
|
|
37
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
|
|
38
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskUDFSchema
|
|
32
39
|
from nv_ingest_client.util.util import generate_matching_files
|
|
33
40
|
|
|
34
41
|
logger = logging.getLogger(__name__)
|
|
@@ -78,12 +85,6 @@ class ClientType(str, Enum):
|
|
|
78
85
|
KAFKA = "KAFKA"
|
|
79
86
|
|
|
80
87
|
|
|
81
|
-
# Example TaskId validation set
|
|
82
|
-
VALID_TASK_IDS = {"task1", "task2", "task3"}
|
|
83
|
-
|
|
84
|
-
_MODULE_UNDER_TEST = "nv_ingest_client.cli.util.click"
|
|
85
|
-
|
|
86
|
-
|
|
87
88
|
def debug_print_click_options(ctx: click.Context) -> None:
|
|
88
89
|
"""
|
|
89
90
|
Retrieves all options from the Click context and pretty prints them.
|
|
@@ -149,9 +150,11 @@ TaskType = Union[
|
|
|
149
150
|
EmbedTask,
|
|
150
151
|
ExtractTask,
|
|
151
152
|
FilterTask,
|
|
153
|
+
InfographicExtractionTask,
|
|
152
154
|
SplitTask,
|
|
153
155
|
StoreEmbedTask,
|
|
154
156
|
StoreTask,
|
|
157
|
+
UDFTask,
|
|
155
158
|
]
|
|
156
159
|
|
|
157
160
|
|
|
@@ -178,7 +181,32 @@ def parse_task_options(task_id: str, options_str: str) -> Dict[str, Any]:
|
|
|
178
181
|
the error details (e.g., expected property format), and show the input that was provided.
|
|
179
182
|
"""
|
|
180
183
|
try:
|
|
181
|
-
|
|
184
|
+
options = json.loads(options_str)
|
|
185
|
+
|
|
186
|
+
# Convert string boolean values to actual booleans for extract tasks
|
|
187
|
+
if task_id == "extract":
|
|
188
|
+
boolean_fields = [
|
|
189
|
+
"extract_text",
|
|
190
|
+
"extract_images",
|
|
191
|
+
"extract_tables",
|
|
192
|
+
"extract_charts",
|
|
193
|
+
"extract_infographics",
|
|
194
|
+
"extract_page_as_image",
|
|
195
|
+
]
|
|
196
|
+
for field in boolean_fields:
|
|
197
|
+
if field in options:
|
|
198
|
+
value = options[field]
|
|
199
|
+
if isinstance(value, str):
|
|
200
|
+
if value.lower() in ("true", "1", "yes", "on"):
|
|
201
|
+
options[field] = True
|
|
202
|
+
elif value.lower() in ("false", "0", "no", "off"):
|
|
203
|
+
options[field] = False
|
|
204
|
+
else:
|
|
205
|
+
raise ValueError(
|
|
206
|
+
f"Invalid boolean value for {field}: '{value}'. Use true/false, 1/0, yes/no, or on/off."
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
return options
|
|
182
210
|
except json.JSONDecodeError as e:
|
|
183
211
|
error_message = (
|
|
184
212
|
f"Invalid JSON format for task '{task_id}': {e.msg} at line {e.lineno} column {e.colno} (char {e.pos}). "
|
|
@@ -229,46 +257,170 @@ def click_validate_task(ctx: click.Context, param: click.Parameter, value: List[
|
|
|
229
257
|
options: Dict[str, Any] = parse_task_options(task_id, json_options)
|
|
230
258
|
|
|
231
259
|
if task_id == "split":
|
|
232
|
-
task_options = check_schema(
|
|
260
|
+
task_options = check_schema(IngestTaskSplitSchema, options, task_id, json_options)
|
|
233
261
|
new_task_id = f"{task_id}"
|
|
234
262
|
new_task = [(new_task_id, SplitTask(**task_options.model_dump()))]
|
|
235
263
|
elif task_id == "extract":
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
264
|
+
# Map CLI parameters to API schema structure
|
|
265
|
+
method = options.pop("extract_method", None)
|
|
266
|
+
if method is None:
|
|
267
|
+
method = "pdfium" # Default fallback
|
|
268
|
+
|
|
269
|
+
# Build params dict for API schema
|
|
270
|
+
params = {k: v for k, v in options.items() if k != "document_type"}
|
|
271
|
+
|
|
272
|
+
# Validate with API schema
|
|
273
|
+
api_options = {
|
|
274
|
+
"document_type": options.get("document_type"),
|
|
275
|
+
"method": method,
|
|
276
|
+
"params": params,
|
|
277
|
+
}
|
|
278
|
+
task_options = check_schema(IngestTaskExtractSchema, api_options, task_id, json_options)
|
|
279
|
+
new_task_id = f"{task_id}_{task_options.document_type.value}"
|
|
280
|
+
|
|
281
|
+
# Create ExtractTask with original CLI parameters
|
|
282
|
+
extract_task_params = {
|
|
283
|
+
"document_type": task_options.document_type,
|
|
284
|
+
"extract_method": task_options.method,
|
|
285
|
+
**task_options.params,
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
# Start with the main extract task
|
|
289
|
+
new_task = [(new_task_id, ExtractTask(**extract_task_params))]
|
|
290
|
+
|
|
291
|
+
# Add ChartExtractionTask if extract_charts is True
|
|
292
|
+
if task_options.params.get("extract_charts", False):
|
|
293
|
+
from nv_ingest_client.primitives.tasks import ChartExtractionTask
|
|
294
|
+
|
|
295
|
+
chart_task_id = "chart_data_extract"
|
|
296
|
+
chart_params = {"params": {}} # ChartExtractionTask takes params dict
|
|
297
|
+
new_task.append((chart_task_id, ChartExtractionTask(chart_params)))
|
|
298
|
+
|
|
299
|
+
# Add TableExtractionTask if extract_tables is True
|
|
300
|
+
if task_options.params.get("extract_tables", False):
|
|
301
|
+
from nv_ingest_client.primitives.tasks import TableExtractionTask
|
|
302
|
+
|
|
303
|
+
table_task_id = "table_data_extract"
|
|
304
|
+
new_task.append((table_task_id, TableExtractionTask()))
|
|
239
305
|
elif task_id == "store":
|
|
240
|
-
task_options = check_schema(
|
|
306
|
+
task_options = check_schema(IngestTaskStoreSchema, options, task_id, json_options)
|
|
241
307
|
new_task_id = f"{task_id}"
|
|
242
308
|
new_task = [(new_task_id, StoreTask(**task_options.model_dump()))]
|
|
243
309
|
elif task_id == "store_embedding":
|
|
244
|
-
task_options = check_schema(
|
|
310
|
+
task_options = check_schema(IngestTaskStoreEmbedSchema, options, task_id, json_options)
|
|
245
311
|
new_task_id = f"{task_id}"
|
|
246
312
|
new_task = [(new_task_id, StoreEmbedTask(**task_options.model_dump()))]
|
|
247
313
|
elif task_id == "caption":
|
|
248
|
-
task_options = check_schema(
|
|
314
|
+
task_options = check_schema(IngestTaskCaptionSchema, options, task_id, json_options)
|
|
249
315
|
new_task_id = f"{task_id}"
|
|
250
|
-
|
|
316
|
+
# Extract individual parameters from API schema for CaptionTask constructor
|
|
317
|
+
caption_params = {
|
|
318
|
+
"api_key": task_options.api_key,
|
|
319
|
+
"endpoint_url": task_options.endpoint_url,
|
|
320
|
+
"prompt": task_options.prompt,
|
|
321
|
+
"model_name": task_options.model_name,
|
|
322
|
+
}
|
|
323
|
+
new_task = [(new_task_id, CaptionTask(**caption_params))]
|
|
251
324
|
elif task_id == "dedup":
|
|
252
|
-
task_options = check_schema(
|
|
325
|
+
task_options = check_schema(IngestTaskDedupSchema, options, task_id, json_options)
|
|
253
326
|
new_task_id = f"{task_id}"
|
|
254
|
-
|
|
327
|
+
# Extract individual parameters from API schema for DedupTask constructor
|
|
328
|
+
dedup_params = {
|
|
329
|
+
"content_type": task_options.content_type,
|
|
330
|
+
"filter": task_options.params.filter,
|
|
331
|
+
}
|
|
332
|
+
new_task = [(new_task_id, DedupTask(**dedup_params))]
|
|
255
333
|
elif task_id == "filter":
|
|
256
|
-
task_options = check_schema(
|
|
334
|
+
task_options = check_schema(IngestTaskFilterSchema, options, task_id, json_options)
|
|
257
335
|
new_task_id = f"{task_id}"
|
|
258
|
-
|
|
336
|
+
# Extract individual parameters from API schema for FilterTask constructor
|
|
337
|
+
filter_params = {
|
|
338
|
+
"content_type": task_options.content_type,
|
|
339
|
+
"min_size": task_options.params.min_size,
|
|
340
|
+
"max_aspect_ratio": task_options.params.max_aspect_ratio,
|
|
341
|
+
"min_aspect_ratio": task_options.params.min_aspect_ratio,
|
|
342
|
+
"filter": task_options.params.filter,
|
|
343
|
+
}
|
|
344
|
+
new_task = [(new_task_id, FilterTask(**filter_params))]
|
|
259
345
|
elif task_id == "embed":
|
|
260
|
-
task_options = check_schema(
|
|
346
|
+
task_options = check_schema(IngestTaskEmbedSchema, options, task_id, json_options)
|
|
261
347
|
new_task_id = f"{task_id}"
|
|
262
348
|
new_task = [(new_task_id, EmbedTask(**task_options.model_dump()))]
|
|
349
|
+
elif task_id == "infographic":
|
|
350
|
+
task_options = check_schema(IngestTaskInfographicExtraction, options, task_id, json_options)
|
|
351
|
+
new_task_id = f"{task_id}"
|
|
352
|
+
new_task = [(new_task_id, InfographicExtractionTask(**task_options.model_dump()))]
|
|
353
|
+
elif task_id == "udf":
|
|
354
|
+
# Validate mutual exclusivity of target_stage and phase
|
|
355
|
+
has_target_stage = "target_stage" in options and options["target_stage"] is not None
|
|
356
|
+
has_phase = "phase" in options and options["phase"] is not None
|
|
357
|
+
|
|
358
|
+
if has_target_stage and has_phase:
|
|
359
|
+
raise ValueError(
|
|
360
|
+
"UDF task cannot specify both 'target_stage' and 'phase'. Please specify only one."
|
|
361
|
+
)
|
|
362
|
+
elif not has_target_stage and not has_phase:
|
|
363
|
+
raise ValueError("UDF task must specify either 'target_stage' or 'phase'.")
|
|
364
|
+
|
|
365
|
+
# Pre-process UDF task options to convert phase names to integers
|
|
366
|
+
if "phase" in options and isinstance(options["phase"], str):
|
|
367
|
+
# Convert phase string to integer using the same logic as UDFTask
|
|
368
|
+
phase_str = options["phase"].upper()
|
|
369
|
+
phase_aliases = {
|
|
370
|
+
"PRE_PROCESSING": PipelinePhase.PRE_PROCESSING,
|
|
371
|
+
"PREPROCESSING": PipelinePhase.PRE_PROCESSING,
|
|
372
|
+
"PRE": PipelinePhase.PRE_PROCESSING,
|
|
373
|
+
"EXTRACTION": PipelinePhase.EXTRACTION,
|
|
374
|
+
"EXTRACT": PipelinePhase.EXTRACTION,
|
|
375
|
+
"POST_PROCESSING": PipelinePhase.POST_PROCESSING,
|
|
376
|
+
"POSTPROCESSING": PipelinePhase.POST_PROCESSING,
|
|
377
|
+
"POST": PipelinePhase.POST_PROCESSING,
|
|
378
|
+
"MUTATION": PipelinePhase.MUTATION,
|
|
379
|
+
"MUTATE": PipelinePhase.MUTATION,
|
|
380
|
+
"TRANSFORM": PipelinePhase.TRANSFORM,
|
|
381
|
+
"RESPONSE": PipelinePhase.RESPONSE,
|
|
382
|
+
"RESP": PipelinePhase.RESPONSE,
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
if phase_str in phase_aliases:
|
|
386
|
+
options["phase"] = phase_aliases[phase_str].value
|
|
387
|
+
else:
|
|
388
|
+
raise ValueError(f"Invalid phase name: {options['phase']}")
|
|
389
|
+
|
|
390
|
+
# Try to infer udf_function_name if not provided
|
|
391
|
+
if "udf_function_name" not in options or not options["udf_function_name"]:
|
|
392
|
+
udf_function = options.get("udf_function", "")
|
|
393
|
+
if udf_function:
|
|
394
|
+
inferred_name = infer_udf_function_name(udf_function)
|
|
395
|
+
if inferred_name:
|
|
396
|
+
options["udf_function_name"] = inferred_name
|
|
397
|
+
logger.info(f"Inferred UDF function name: {inferred_name}")
|
|
398
|
+
else:
|
|
399
|
+
raise ValueError(
|
|
400
|
+
f"Could not infer UDF function name from '{udf_function}'. "
|
|
401
|
+
"Please specify 'udf_function_name' explicitly."
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
task_options = check_schema(IngestTaskUDFSchema, options, task_id, json_options)
|
|
405
|
+
new_task_id = f"{task_id}"
|
|
406
|
+
new_task = [(new_task_id, UDFTask(**task_options.model_dump()))]
|
|
263
407
|
else:
|
|
264
408
|
raise ValueError(f"Unsupported task type: {task_id}")
|
|
265
409
|
|
|
410
|
+
# Duplicate task IDs are allowed: repeated tasks with the same ID are collected into a list below
|
|
266
411
|
if new_task_id in validated_tasks:
|
|
267
|
-
|
|
412
|
+
logger.debug(f"Multiple tasks detected for {new_task_id}, storing as list")
|
|
268
413
|
|
|
269
414
|
logger.debug("Adding task: %s", new_task_id)
|
|
270
415
|
for task_tuple in new_task:
|
|
271
|
-
|
|
416
|
+
if task_tuple[0] in validated_tasks:
|
|
417
|
+
# Convert single task to list if needed, then append
|
|
418
|
+
existing_task = validated_tasks[task_tuple[0]]
|
|
419
|
+
if not isinstance(existing_task, list):
|
|
420
|
+
validated_tasks[task_tuple[0]] = [existing_task]
|
|
421
|
+
validated_tasks[task_tuple[0]].append(task_tuple[1])
|
|
422
|
+
else:
|
|
423
|
+
validated_tasks[task_tuple[0]] = task_tuple[1]
|
|
272
424
|
except ValueError as e:
|
|
273
425
|
validation_errors.append(str(e))
|
|
274
426
|
|
|
@@ -27,6 +27,16 @@ from typing import Union
|
|
|
27
27
|
from urllib.parse import urlparse
|
|
28
28
|
|
|
29
29
|
import fsspec
|
|
30
|
+
from nv_ingest_api.internal.enums.common import PipelinePhase
|
|
31
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
|
|
32
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
|
|
33
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
|
|
34
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
|
|
35
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
|
|
36
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
|
|
37
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
|
|
38
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
|
|
39
|
+
from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
|
|
30
40
|
from nv_ingest_client.client.client import NvIngestClient
|
|
31
41
|
from nv_ingest_client.client.util.processing import get_valid_filename
|
|
32
42
|
from nv_ingest_client.client.util.processing import save_document_results_to_jsonl
|
|
@@ -38,16 +48,9 @@ from nv_ingest_client.primitives.tasks import EmbedTask
|
|
|
38
48
|
from nv_ingest_client.primitives.tasks import ExtractTask
|
|
39
49
|
from nv_ingest_client.primitives.tasks import FilterTask
|
|
40
50
|
from nv_ingest_client.primitives.tasks import SplitTask
|
|
41
|
-
from nv_ingest_client.primitives.tasks import StoreEmbedTask
|
|
42
51
|
from nv_ingest_client.primitives.tasks import StoreTask
|
|
43
|
-
from nv_ingest_client.primitives.tasks
|
|
44
|
-
from nv_ingest_client.primitives.tasks
|
|
45
|
-
from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
|
|
46
|
-
from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
|
|
47
|
-
from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
|
|
48
|
-
from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
|
|
49
|
-
from nv_ingest_client.primitives.tasks.store import StoreEmbedTaskSchema
|
|
50
|
-
from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
|
|
52
|
+
from nv_ingest_client.primitives.tasks import StoreEmbedTask
|
|
53
|
+
from nv_ingest_client.primitives.tasks import UDFTask
|
|
51
54
|
from nv_ingest_client.util.processing import check_schema
|
|
52
55
|
from nv_ingest_client.util.system import ensure_directory_with_permissions
|
|
53
56
|
from nv_ingest_client.util.util import filter_function_kwargs
|
|
@@ -436,7 +439,7 @@ class Ingestor:
|
|
|
436
439
|
|
|
437
440
|
final_results_payload_list: Union[List[List[Dict[str, Any]]], List[LazyLoadedList]] = []
|
|
438
441
|
|
|
439
|
-
# Lock for thread-safe
|
|
442
|
+
# Lock for thread-safe appending to final_results_payload_list by I/O tasks
|
|
440
443
|
results_lock = threading.Lock() if self._output_config else None
|
|
441
444
|
|
|
442
445
|
io_executor: Optional[ThreadPoolExecutor] = None
|
|
@@ -698,8 +701,23 @@ class Ingestor:
|
|
|
698
701
|
Ingestor
|
|
699
702
|
Returns self for chaining.
|
|
700
703
|
"""
|
|
701
|
-
|
|
702
|
-
|
|
704
|
+
# Extract content_type and build params dict for API schema
|
|
705
|
+
content_type = kwargs.pop("content_type", "text") # Default to "text" if not specified
|
|
706
|
+
params = kwargs # Remaining parameters go into params dict
|
|
707
|
+
|
|
708
|
+
# Validate with API schema
|
|
709
|
+
api_options = {
|
|
710
|
+
"content_type": content_type,
|
|
711
|
+
"params": params,
|
|
712
|
+
}
|
|
713
|
+
task_options = check_schema(IngestTaskDedupSchema, api_options, "dedup", json.dumps(api_options))
|
|
714
|
+
|
|
715
|
+
# Extract individual parameters from API schema for DedupTask constructor
|
|
716
|
+
dedup_params = {
|
|
717
|
+
"content_type": task_options.content_type,
|
|
718
|
+
"filter": task_options.params.filter,
|
|
719
|
+
}
|
|
720
|
+
dedup_task = DedupTask(**dedup_params)
|
|
703
721
|
self._job_specs.add_task(dedup_task)
|
|
704
722
|
|
|
705
723
|
return self
|
|
@@ -719,8 +737,14 @@ class Ingestor:
|
|
|
719
737
|
Ingestor
|
|
720
738
|
Returns self for chaining.
|
|
721
739
|
"""
|
|
722
|
-
|
|
723
|
-
|
|
740
|
+
# Filter out deprecated parameters before API schema validation
|
|
741
|
+
# The EmbedTask constructor handles these deprecated parameters with warnings
|
|
742
|
+
filtered_kwargs = {k: v for k, v in kwargs.items() if k not in ["text", "tables"]}
|
|
743
|
+
|
|
744
|
+
_ = check_schema(IngestTaskEmbedSchema, filtered_kwargs, "embed", json.dumps(filtered_kwargs))
|
|
745
|
+
|
|
746
|
+
# Pass original kwargs to EmbedTask constructor so it can handle deprecated parameters
|
|
747
|
+
embed_task = EmbedTask(**kwargs)
|
|
724
748
|
self._job_specs.add_task(embed_task)
|
|
725
749
|
|
|
726
750
|
return self
|
|
@@ -767,9 +791,52 @@ class Ingestor:
|
|
|
767
791
|
extract_page_as_image=extract_page_as_image,
|
|
768
792
|
**kwargs,
|
|
769
793
|
)
|
|
770
|
-
task_options = check_schema(ExtractTaskSchema, task_options, "extract", json.dumps(task_options))
|
|
771
794
|
|
|
772
|
-
|
|
795
|
+
# Extract method from task_options for API schema
|
|
796
|
+
method = task_options.pop("extract_method", None)
|
|
797
|
+
if method is None:
|
|
798
|
+
# No extract_method supplied: use "pdfium" so schema validation and ExtractTask receive a concrete method
|
|
799
|
+
method = "pdfium" # Default fallback
|
|
800
|
+
|
|
801
|
+
# Build params dict for API schema
|
|
802
|
+
params = {k: v for k, v in task_options.items() if k != "document_type"}
|
|
803
|
+
|
|
804
|
+
# Map document type to API schema expected values
|
|
805
|
+
# Handle common file extension to DocumentTypeEnum mapping
|
|
806
|
+
document_type_mapping = {
|
|
807
|
+
"txt": "text",
|
|
808
|
+
"md": "text",
|
|
809
|
+
"sh": "text",
|
|
810
|
+
"json": "text",
|
|
811
|
+
"jpg": "jpeg",
|
|
812
|
+
"jpeg": "jpeg",
|
|
813
|
+
"png": "png",
|
|
814
|
+
"pdf": "pdf",
|
|
815
|
+
"docx": "docx",
|
|
816
|
+
"pptx": "pptx",
|
|
817
|
+
"html": "html",
|
|
818
|
+
"bmp": "bmp",
|
|
819
|
+
"tiff": "tiff",
|
|
820
|
+
"svg": "svg",
|
|
821
|
+
"mp3": "mp3",
|
|
822
|
+
"wav": "wav",
|
|
823
|
+
}
|
|
824
|
+
|
|
825
|
+
# Use mapped document type for API schema validation
|
|
826
|
+
api_document_type = document_type_mapping.get(document_type.lower(), document_type)
|
|
827
|
+
|
|
828
|
+
# Validate with API schema
|
|
829
|
+
api_task_options = {
|
|
830
|
+
"document_type": api_document_type,
|
|
831
|
+
"method": method,
|
|
832
|
+
"params": params,
|
|
833
|
+
}
|
|
834
|
+
|
|
835
|
+
check_schema(IngestTaskExtractSchema, api_task_options, "extract", json.dumps(api_task_options))
|
|
836
|
+
|
|
837
|
+
# Create ExtractTask with mapped document type for API schema compatibility
|
|
838
|
+
extract_task_params = {"document_type": api_document_type, "extract_method": method, **params}
|
|
839
|
+
extract_task = ExtractTask(**extract_task_params)
|
|
773
840
|
self._job_specs.add_task(extract_task, document_type=document_type)
|
|
774
841
|
|
|
775
842
|
return self
|
|
@@ -789,8 +856,27 @@ class Ingestor:
|
|
|
789
856
|
Ingestor
|
|
790
857
|
Returns self for chaining.
|
|
791
858
|
"""
|
|
792
|
-
|
|
793
|
-
|
|
859
|
+
# Restructure parameters to match API schema structure
|
|
860
|
+
params_fields = {"min_size", "max_aspect_ratio", "min_aspect_ratio", "filter"}
|
|
861
|
+
params = {k: v for k, v in kwargs.items() if k in params_fields}
|
|
862
|
+
top_level = {k: v for k, v in kwargs.items() if k not in params_fields}
|
|
863
|
+
|
|
864
|
+
# Build API schema structure
|
|
865
|
+
api_kwargs = top_level.copy()
|
|
866
|
+
if params:
|
|
867
|
+
api_kwargs["params"] = params
|
|
868
|
+
|
|
869
|
+
task_options = check_schema(IngestTaskFilterSchema, api_kwargs, "filter", json.dumps(api_kwargs))
|
|
870
|
+
|
|
871
|
+
# Extract individual parameters from API schema for FilterTask constructor
|
|
872
|
+
filter_params = {
|
|
873
|
+
"content_type": task_options.content_type,
|
|
874
|
+
"min_size": task_options.params.min_size,
|
|
875
|
+
"max_aspect_ratio": task_options.params.max_aspect_ratio,
|
|
876
|
+
"min_aspect_ratio": task_options.params.min_aspect_ratio,
|
|
877
|
+
"filter": task_options.params.filter,
|
|
878
|
+
}
|
|
879
|
+
filter_task = FilterTask(**filter_params)
|
|
794
880
|
self._job_specs.add_task(filter_task)
|
|
795
881
|
|
|
796
882
|
return self
|
|
@@ -810,7 +896,7 @@ class Ingestor:
|
|
|
810
896
|
Ingestor
|
|
811
897
|
Returns self for chaining.
|
|
812
898
|
"""
|
|
813
|
-
task_options = check_schema(
|
|
899
|
+
task_options = check_schema(IngestTaskSplitSchema, kwargs, "split", json.dumps(kwargs))
|
|
814
900
|
extract_task = SplitTask(**task_options.model_dump())
|
|
815
901
|
self._job_specs.add_task(extract_task)
|
|
816
902
|
|
|
@@ -831,8 +917,24 @@ class Ingestor:
|
|
|
831
917
|
Ingestor
|
|
832
918
|
Returns self for chaining.
|
|
833
919
|
"""
|
|
834
|
-
|
|
835
|
-
|
|
920
|
+
# Handle parameter name mapping: store_method -> method for API schema
|
|
921
|
+
if "store_method" in kwargs:
|
|
922
|
+
kwargs["method"] = kwargs.pop("store_method")
|
|
923
|
+
|
|
924
|
+
# Provide default method if not specified (matching client StoreTask behavior)
|
|
925
|
+
if "method" not in kwargs:
|
|
926
|
+
kwargs["method"] = "minio"
|
|
927
|
+
|
|
928
|
+
task_options = check_schema(IngestTaskStoreSchema, kwargs, "store", json.dumps(kwargs))
|
|
929
|
+
|
|
930
|
+
# Map API schema fields back to StoreTask constructor parameters
|
|
931
|
+
store_params = {
|
|
932
|
+
"structured": task_options.structured,
|
|
933
|
+
"images": task_options.images,
|
|
934
|
+
"store_method": task_options.method, # Map method back to store_method
|
|
935
|
+
"params": task_options.params,
|
|
936
|
+
}
|
|
937
|
+
store_task = StoreTask(**store_params)
|
|
836
938
|
self._job_specs.add_task(store_task)
|
|
837
939
|
|
|
838
940
|
return self
|
|
@@ -840,24 +942,97 @@ class Ingestor:
|
|
|
840
942
|
@ensure_job_specs
|
|
841
943
|
def store_embed(self, **kwargs: Any) -> "Ingestor":
|
|
842
944
|
"""
|
|
843
|
-
Adds a
|
|
945
|
+
Adds a StoreEmbedTask to the batch job specification.
|
|
844
946
|
|
|
845
947
|
Parameters
|
|
846
948
|
----------
|
|
847
949
|
kwargs : dict
|
|
848
|
-
Parameters specific to the
|
|
950
|
+
Parameters specific to the StoreEmbedTask.
|
|
849
951
|
|
|
850
952
|
Returns
|
|
851
953
|
-------
|
|
852
954
|
Ingestor
|
|
853
955
|
Returns self for chaining.
|
|
854
956
|
"""
|
|
855
|
-
task_options = check_schema(
|
|
957
|
+
task_options = check_schema(IngestTaskStoreEmbedSchema, kwargs, "store_embedding", json.dumps(kwargs))
|
|
856
958
|
store_task = StoreEmbedTask(**task_options.model_dump())
|
|
857
959
|
self._job_specs.add_task(store_task)
|
|
858
960
|
|
|
859
961
|
return self
|
|
860
962
|
|
|
963
|
+
def udf(
|
|
964
|
+
self,
|
|
965
|
+
udf_function: str,
|
|
966
|
+
udf_function_name: Optional[str] = None,
|
|
967
|
+
phase: Optional[Union[PipelinePhase, int, str]] = None,
|
|
968
|
+
target_stage: Optional[str] = None,
|
|
969
|
+
run_before: bool = False,
|
|
970
|
+
run_after: bool = False,
|
|
971
|
+
) -> "Ingestor":
|
|
972
|
+
"""
|
|
973
|
+
Adds a UDFTask to the batch job specification.
|
|
974
|
+
|
|
975
|
+
Parameters
|
|
976
|
+
----------
|
|
977
|
+
udf_function : str
|
|
978
|
+
UDF specification. Supports three formats:
|
|
979
|
+
1. Inline function: 'def my_func(control_message): ...'
|
|
980
|
+
2. Import path: 'my_module.my_function'
|
|
981
|
+
3. File path: '/path/to/file.py:function_name'
|
|
982
|
+
udf_function_name : str, optional
|
|
983
|
+
Name of the function to execute from the UDF specification.
|
|
984
|
+
If not provided, attempts to infer from udf_function.
|
|
985
|
+
phase : Union[PipelinePhase, int, str], optional
|
|
986
|
+
Pipeline phase to execute UDF. Accepts phase names ('extract', 'split', 'embed', 'response')
|
|
987
|
+
or numbers (1-4). Cannot be used with target_stage. If neither phase nor target_stage is given, defaults to PipelinePhase.RESPONSE.
|
|
988
|
+
target_stage : str, optional
|
|
989
|
+
Specific stage name to target for UDF execution. Cannot be used with phase.
|
|
990
|
+
run_before : bool, optional
|
|
991
|
+
If True and target_stage is specified, run UDF before the target stage. Default: False.
|
|
992
|
+
run_after : bool, optional
|
|
993
|
+
If True and target_stage is specified, run UDF after the target stage. Default: False.
|
|
994
|
+
|
|
995
|
+
Returns
|
|
996
|
+
-------
|
|
997
|
+
Ingestor
|
|
998
|
+
Returns self for chaining.
|
|
999
|
+
|
|
1000
|
+
Raises
|
|
1001
|
+
------
|
|
1002
|
+
ValueError
|
|
1003
|
+
If udf_function_name cannot be inferred and is not provided explicitly,
|
|
1004
|
+
or if both phase and target_stage are specified. (If neither is specified, phase defaults to PipelinePhase.RESPONSE.)
|
|
1005
|
+
"""
|
|
1006
|
+
# Validate mutual exclusivity of phase and target_stage
|
|
1007
|
+
if phase is not None and target_stage is not None:
|
|
1008
|
+
raise ValueError("Cannot specify both 'phase' and 'target_stage'. Please specify only one.")
|
|
1009
|
+
elif phase is None and target_stage is None:
|
|
1010
|
+
# Default to response phase for backward compatibility
|
|
1011
|
+
phase = PipelinePhase.RESPONSE
|
|
1012
|
+
|
|
1013
|
+
# Try to infer udf_function_name if not provided
|
|
1014
|
+
if udf_function_name is None:
|
|
1015
|
+
udf_function_name = infer_udf_function_name(udf_function)
|
|
1016
|
+
if udf_function_name is None:
|
|
1017
|
+
raise ValueError(
|
|
1018
|
+
f"Could not infer UDF function name from '{udf_function}'. "
|
|
1019
|
+
"Please specify 'udf_function_name' explicitly."
|
|
1020
|
+
)
|
|
1021
|
+
logger.info(f"Inferred UDF function name: {udf_function_name}")
|
|
1022
|
+
|
|
1023
|
+
# Use UDFTask constructor with explicit parameters
|
|
1024
|
+
udf_task = UDFTask(
|
|
1025
|
+
udf_function=udf_function,
|
|
1026
|
+
udf_function_name=udf_function_name,
|
|
1027
|
+
phase=phase,
|
|
1028
|
+
target_stage=target_stage,
|
|
1029
|
+
run_before=run_before,
|
|
1030
|
+
run_after=run_after,
|
|
1031
|
+
)
|
|
1032
|
+
self._job_specs.add_task(udf_task)
|
|
1033
|
+
|
|
1034
|
+
return self
|
|
1035
|
+
|
|
861
1036
|
def vdb_upload(self, purge_results_after_upload: bool = True, **kwargs: Any) -> "Ingestor":
|
|
862
1037
|
"""
|
|
863
1038
|
Adds a VdbUploadTask to the batch job specification.
|
|
@@ -986,8 +1161,16 @@ class Ingestor:
|
|
|
986
1161
|
Ingestor
|
|
987
1162
|
Returns self for chaining.
|
|
988
1163
|
"""
|
|
989
|
-
task_options = check_schema(
|
|
990
|
-
|
|
1164
|
+
task_options = check_schema(IngestTaskCaptionSchema, kwargs, "caption", json.dumps(kwargs))
|
|
1165
|
+
|
|
1166
|
+
# Extract individual parameters from API schema for CaptionTask constructor
|
|
1167
|
+
caption_params = {
|
|
1168
|
+
"api_key": task_options.api_key,
|
|
1169
|
+
"endpoint_url": task_options.endpoint_url,
|
|
1170
|
+
"prompt": task_options.prompt,
|
|
1171
|
+
"model_name": task_options.model_name,
|
|
1172
|
+
}
|
|
1173
|
+
caption_task = CaptionTask(**caption_params)
|
|
991
1174
|
self._job_specs.add_task(caption_task)
|
|
992
1175
|
|
|
993
1176
|
return self
|
|
@@ -169,6 +169,22 @@ Tasks and Options:
|
|
|
169
169
|
- split_length (int): Segment length. No default.
|
|
170
170
|
- split_overlap (int): Segment overlap. No default.
|
|
171
171
|
\b
|
|
172
|
+
- udf: Executes user-defined functions (UDFs) for custom processing logic.
|
|
173
|
+
Options:
|
|
174
|
+
- udf_function (str): UDF specification. Supports three formats:
|
|
175
|
+
1. Inline function: 'def my_func(control_message): ...'
|
|
176
|
+
2. Import path: 'my_module.my_function'
|
|
177
|
+
3. File path: '/path/to/file.py:function_name' or '/path/to/file.py' (assumes 'process' function)
|
|
178
|
+
- udf_function_name (str): Name of the function to execute from the UDF specification. Required.
|
|
179
|
+
- target_stage (str): Specific pipeline stage name to target for UDF execution (e.g.,
|
|
180
|
+
'text_extractor', 'text_embedder', 'image_extractor'). Cannot be used with phase.
|
|
181
|
+
- run_before (bool): If True and target_stage is specified, run UDF before the target stage. Default: False.
|
|
182
|
+
- run_after (bool): If True and target_stage is specified, run UDF after the target stage. Default: False.
|
|
183
|
+
Examples:
|
|
184
|
+
--task 'udf:{"udf_function": "my_file.py:my_func", "target_stage": "text_embedder", "run_before": true}'
|
|
185
|
+
--task 'udf:{"udf_function": "def process(cm): return cm",
|
|
186
|
+
"target_stage": "image_extractor", "run_after": true}'
|
|
187
|
+
\b
|
|
172
188
|
Note: The 'extract_method' automatically selects the optimal method based on 'document_type' if not explicitly stated.
|
|
173
189
|
""",
|
|
174
190
|
)
|