nv-ingest-client 2025.10.26.dev20251026__tar.gz → 2025.10.28.dev20251028__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- {nv_ingest_client-2025.10.26.dev20251026/src/nv_ingest_client.egg-info → nv_ingest_client-2025.10.28.dev20251028}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/client/client.py +83 -6
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/client/interface.py +40 -27
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/embed.py +16 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/udf.py +24 -27
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/vdb/milvus.py +4 -4
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/LICENSE +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/README.md +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/pyproject.toml +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/setup.cfg +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/client/ingest_job_handler.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/jobs/job_spec.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/document_analysis.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/util.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client.egg-info/SOURCES.txt +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/version.py +0 -0
|
@@ -44,6 +44,50 @@ from nv_ingest_client.util.util import (
|
|
|
44
44
|
logger = logging.getLogger(__name__)
|
|
45
45
|
|
|
46
46
|
|
|
47
|
+
def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
|
|
48
|
+
"""
|
|
49
|
+
Compute resident_time entries from entry/exit pairs if not already present.
|
|
50
|
+
|
|
51
|
+
This ensures consistency between split jobs (where server computes resident_time)
|
|
52
|
+
and non-split jobs (where we compute it client-side).
|
|
53
|
+
|
|
54
|
+
Parameters
|
|
55
|
+
----------
|
|
56
|
+
trace_dict : Dict[str, Any]
|
|
57
|
+
Trace dictionary with entry/exit pairs
|
|
58
|
+
|
|
59
|
+
Returns
|
|
60
|
+
-------
|
|
61
|
+
Dict[str, Any]
|
|
62
|
+
Trace dictionary with resident_time entries added
|
|
63
|
+
"""
|
|
64
|
+
if not trace_dict or not isinstance(trace_dict, dict):
|
|
65
|
+
return trace_dict
|
|
66
|
+
|
|
67
|
+
# Check if resident_time already exists (server-computed for split jobs)
|
|
68
|
+
has_resident = any(k.startswith("trace::resident_time::") for k in trace_dict.keys())
|
|
69
|
+
if has_resident:
|
|
70
|
+
return trace_dict # Already computed by server
|
|
71
|
+
|
|
72
|
+
# Compute resident_time from entry/exit pairs
|
|
73
|
+
result = dict(trace_dict)
|
|
74
|
+
stages = set()
|
|
75
|
+
|
|
76
|
+
# Find all unique stages
|
|
77
|
+
for key in trace_dict:
|
|
78
|
+
if key.startswith("trace::entry::"):
|
|
79
|
+
stages.add(key.replace("trace::entry::", ""))
|
|
80
|
+
|
|
81
|
+
# Compute resident_time for each stage
|
|
82
|
+
for stage in stages:
|
|
83
|
+
entry_key = f"trace::entry::{stage}"
|
|
84
|
+
exit_key = f"trace::exit::{stage}"
|
|
85
|
+
if entry_key in trace_dict and exit_key in trace_dict:
|
|
86
|
+
result[f"trace::resident_time::{stage}"] = trace_dict[exit_key] - trace_dict[entry_key]
|
|
87
|
+
|
|
88
|
+
return result
|
|
89
|
+
|
|
90
|
+
|
|
47
91
|
class DataDecodeException(Exception):
|
|
48
92
|
"""
|
|
49
93
|
Exception raised for errors in decoding data.
|
|
@@ -87,6 +131,7 @@ class _ConcurrentProcessor:
|
|
|
87
131
|
stream_to_callback_only: bool,
|
|
88
132
|
return_full_response: bool,
|
|
89
133
|
verbose: bool = False,
|
|
134
|
+
return_traces: bool = False,
|
|
90
135
|
):
|
|
91
136
|
"""
|
|
92
137
|
Initializes the concurrent processor.
|
|
@@ -120,6 +165,8 @@ class _ConcurrentProcessor:
|
|
|
120
165
|
initiating job submission or fetching fails for a batch.
|
|
121
166
|
verbose : bool, optional
|
|
122
167
|
If True, enables detailed debug logging. Default is False.
|
|
168
|
+
return_traces : bool, optional
|
|
169
|
+
If True, parent-level trace data for each completed job is stored.
|
|
123
170
|
|
|
124
171
|
Raises
|
|
125
172
|
------
|
|
@@ -142,12 +189,14 @@ class _ConcurrentProcessor:
|
|
|
142
189
|
self.stream_to_callback_only = stream_to_callback_only
|
|
143
190
|
self.return_full_response = return_full_response
|
|
144
191
|
self.verbose = verbose
|
|
192
|
+
self.return_traces = return_traces
|
|
145
193
|
|
|
146
194
|
# State variables managed across batch cycles
|
|
147
195
|
self.retry_job_ids: List[str] = []
|
|
148
196
|
self.retry_counts: Dict[str, int] = defaultdict(int)
|
|
149
197
|
self.results: List[Dict[str, Any]] = [] # Stores successful results (full dicts)
|
|
150
198
|
self.failures: List[Tuple[str, str]] = [] # (job_index, error_message)
|
|
199
|
+
self.traces: List[Optional[Dict[str, Any]]] = []
|
|
151
200
|
|
|
152
201
|
# --- Initial Checks ---
|
|
153
202
|
if not self.job_queue_id:
|
|
@@ -247,6 +296,14 @@ class _ConcurrentProcessor:
|
|
|
247
296
|
# When requested, return the full response envelope (includes 'trace' and 'annotations')
|
|
248
297
|
self.results.append(result_data if self.return_full_response else result_data.get("data"))
|
|
249
298
|
|
|
299
|
+
# Extract trace data for all successful (non-failed) jobs
|
|
300
|
+
if self.return_traces and not is_failed:
|
|
301
|
+
trace_payload = result_data.get("trace") if result_data else None
|
|
302
|
+
# Compute resident_time if not already present (for consistency)
|
|
303
|
+
if trace_payload:
|
|
304
|
+
trace_payload = _compute_resident_times(trace_payload)
|
|
305
|
+
self.traces.append(trace_payload if trace_payload else None)
|
|
306
|
+
|
|
250
307
|
# Cleanup retry count if it exists
|
|
251
308
|
if job_index in self.retry_counts:
|
|
252
309
|
del self.retry_counts[job_index]
|
|
@@ -438,7 +495,7 @@ class _ConcurrentProcessor:
|
|
|
438
495
|
|
|
439
496
|
return batch_futures_dict, normalized_job_indices
|
|
440
497
|
|
|
441
|
-
def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
|
|
498
|
+
def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
|
|
442
499
|
"""
|
|
443
500
|
Executes the main processing loop in batches.
|
|
444
501
|
|
|
@@ -581,7 +638,7 @@ class _ConcurrentProcessor:
|
|
|
581
638
|
# --- Final Logging ---
|
|
582
639
|
self._log_final_status(total_jobs)
|
|
583
640
|
|
|
584
|
-
return self.results, self.failures
|
|
641
|
+
return self.results, self.failures, self.traces if self.return_traces else []
|
|
585
642
|
|
|
586
643
|
|
|
587
644
|
class NvIngestClient:
|
|
@@ -1212,7 +1269,12 @@ class NvIngestClient:
|
|
|
1212
1269
|
stream_to_callback_only: bool = False,
|
|
1213
1270
|
return_full_response: bool = False,
|
|
1214
1271
|
verbose: bool = False,
|
|
1215
|
-
|
|
1272
|
+
return_traces: bool = False,
|
|
1273
|
+
) -> Union[
|
|
1274
|
+
List[Any],
|
|
1275
|
+
Tuple[List[Any], List[Tuple[str, str]]],
|
|
1276
|
+
Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
|
|
1277
|
+
]:
|
|
1216
1278
|
"""
|
|
1217
1279
|
Submit and fetch multiple jobs concurrently.
|
|
1218
1280
|
|
|
@@ -1247,6 +1309,8 @@ class NvIngestClient:
|
|
|
1247
1309
|
Ignored when stream_to_callback_only=True. Default is False.
|
|
1248
1310
|
verbose : bool, optional
|
|
1249
1311
|
If True, enable debug logging. Default is False.
|
|
1312
|
+
return_traces : bool, optional
|
|
1313
|
+
If True, parent-level aggregated trace metrics are extracted and returned. Default is False.
|
|
1250
1314
|
|
|
1251
1315
|
Returns
|
|
1252
1316
|
-------
|
|
@@ -1254,6 +1318,9 @@ class NvIngestClient:
|
|
|
1254
1318
|
List of successful job results when `return_failures` is False.
|
|
1255
1319
|
results, failures : tuple
|
|
1256
1320
|
Tuple of (successful results, failure tuples) when `return_failures` is True.
|
|
1321
|
+
results, failures, traces : tuple
|
|
1322
|
+
Tuple of (successful results, failure tuples, trace dicts) when both
|
|
1323
|
+
`return_failures` and `return_traces` are True.
|
|
1257
1324
|
|
|
1258
1325
|
Raises
|
|
1259
1326
|
------
|
|
@@ -1266,7 +1333,12 @@ class NvIngestClient:
|
|
|
1266
1333
|
|
|
1267
1334
|
# Handle empty input
|
|
1268
1335
|
if not job_indices:
|
|
1269
|
-
|
|
1336
|
+
if return_failures and return_traces:
|
|
1337
|
+
return [], [], []
|
|
1338
|
+
elif return_failures:
|
|
1339
|
+
return [], []
|
|
1340
|
+
else:
|
|
1341
|
+
return []
|
|
1270
1342
|
|
|
1271
1343
|
# Validate and set batch_size
|
|
1272
1344
|
validated_batch_size = self._validate_batch_size(batch_size)
|
|
@@ -1289,12 +1361,17 @@ class NvIngestClient:
|
|
|
1289
1361
|
stream_to_callback_only=stream_to_callback_only,
|
|
1290
1362
|
return_full_response=return_full_response,
|
|
1291
1363
|
verbose=verbose,
|
|
1364
|
+
return_traces=return_traces,
|
|
1292
1365
|
)
|
|
1293
1366
|
|
|
1294
|
-
results, failures = processor.run()
|
|
1367
|
+
results, failures, traces = processor.run()
|
|
1295
1368
|
|
|
1296
|
-
if return_failures:
|
|
1369
|
+
if return_failures and return_traces:
|
|
1370
|
+
return results, failures, traces
|
|
1371
|
+
elif return_failures:
|
|
1297
1372
|
return results, failures
|
|
1373
|
+
elif return_traces:
|
|
1374
|
+
return results, traces
|
|
1298
1375
|
|
|
1299
1376
|
if failures:
|
|
1300
1377
|
logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
|
|
@@ -402,16 +402,9 @@ class Ingestor:
|
|
|
402
402
|
show_progress: bool = False,
|
|
403
403
|
return_failures: bool = False,
|
|
404
404
|
save_to_disk: bool = False,
|
|
405
|
+
return_traces: bool = False,
|
|
405
406
|
**kwargs: Any,
|
|
406
|
-
) -> Union[
|
|
407
|
-
List[List[Dict[str, Any]]], # In-memory: List of response['data'] for each doc
|
|
408
|
-
List[Dict[str, Any]], # In-memory: Full response envelopes when return_full_response=True
|
|
409
|
-
List[LazyLoadedList], # Disk: List of proxies, one per original doc
|
|
410
|
-
Tuple[
|
|
411
|
-
Union[List[List[Dict[str, Any]]], List[Dict[str, Any]], List[LazyLoadedList]],
|
|
412
|
-
List[Tuple[str, str]],
|
|
413
|
-
],
|
|
414
|
-
]: # noqa: E501
|
|
407
|
+
) -> Union[List[Any], Tuple[Any, ...]]:
|
|
415
408
|
"""
|
|
416
409
|
Ingest documents by submitting jobs and fetching results concurrently.
|
|
417
410
|
|
|
@@ -421,24 +414,30 @@ class Ingestor:
|
|
|
421
414
|
Whether to display a progress bar. Default is False.
|
|
422
415
|
return_failures : bool, optional
|
|
423
416
|
If True, return a tuple (results, failures); otherwise, return only results. Default is False.
|
|
417
|
+
save_to_disk : bool, optional
|
|
418
|
+
If True, save results to disk and return LazyLoadedList proxies. Default is False.
|
|
419
|
+
return_traces : bool, optional
|
|
420
|
+
If True, return trace metrics alongside results. Default is False.
|
|
421
|
+
Traces contain timing metrics (entry, exit, resident_time) for each stage.
|
|
424
422
|
**kwargs : Any
|
|
425
|
-
Additional keyword arguments for the underlying client methods.
|
|
426
|
-
'concurrency_limit', 'timeout', 'max_job_retries', 'retry_delay',
|
|
427
|
-
'data_only', 'return_full_response', 'verbose'. Unrecognized keys are passed
|
|
428
|
-
through to process_jobs_concurrently.
|
|
423
|
+
Additional keyword arguments for the underlying client methods.
|
|
429
424
|
Optional flags include `include_parent_trace_ids=True` to also return
|
|
430
|
-
parent job trace identifiers
|
|
425
|
+
parent job trace identifiers (V2 API only).
|
|
431
426
|
|
|
432
427
|
Returns
|
|
433
428
|
-------
|
|
434
|
-
|
|
435
|
-
|
|
429
|
+
list or tuple
|
|
430
|
+
Returns vary based on flags:
|
|
431
|
+
- Default: list of results
|
|
432
|
+
- return_failures=True: (results, failures)
|
|
433
|
+
- return_traces=True: (results, traces)
|
|
434
|
+
- return_failures=True, return_traces=True: (results, failures, traces)
|
|
435
|
+
- Additional combinations with include_parent_trace_ids kwarg
|
|
436
436
|
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
list of parent trace IDs is appended to the return value.
|
|
437
|
+
Notes
|
|
438
|
+
-----
|
|
439
|
+
Trace metrics include timing data for each processing stage. For detailed
|
|
440
|
+
usage and examples, see src/nv_ingest/api/v2/README.md
|
|
442
441
|
"""
|
|
443
442
|
if save_to_disk and (not self._output_config):
|
|
444
443
|
self.save_to_disk()
|
|
@@ -574,7 +573,8 @@ class Ingestor:
|
|
|
574
573
|
if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
|
|
575
574
|
self._client.enable_telemetry(bool(enable_telemetry))
|
|
576
575
|
|
|
577
|
-
|
|
576
|
+
# Call process_jobs_concurrently
|
|
577
|
+
proc_result = self._client.process_jobs_concurrently(
|
|
578
578
|
job_indices=self._job_ids,
|
|
579
579
|
job_queue_id=self._job_queue_id,
|
|
580
580
|
timeout=timeout,
|
|
@@ -583,9 +583,17 @@ class Ingestor:
|
|
|
583
583
|
return_failures=True,
|
|
584
584
|
stream_to_callback_only=stream_to_callback_only,
|
|
585
585
|
verbose=verbose,
|
|
586
|
+
return_traces=return_traces,
|
|
586
587
|
**proc_kwargs,
|
|
587
588
|
)
|
|
588
589
|
|
|
590
|
+
# Unpack result based on return_traces flag
|
|
591
|
+
if return_traces:
|
|
592
|
+
results, failures, traces_list = proc_result
|
|
593
|
+
else:
|
|
594
|
+
results, failures = proc_result
|
|
595
|
+
traces_list = [] # Empty list when traces not requested
|
|
596
|
+
|
|
589
597
|
if show_progress and pbar:
|
|
590
598
|
pbar.close()
|
|
591
599
|
|
|
@@ -648,13 +656,18 @@ class Ingestor:
|
|
|
648
656
|
|
|
649
657
|
parent_trace_ids = self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
|
|
650
658
|
|
|
651
|
-
|
|
652
|
-
|
|
659
|
+
# Build return tuple based on requested outputs
|
|
660
|
+
# Order: results, failures (if requested), traces (if requested), parent_trace_ids (if requested)
|
|
661
|
+
returns = [results]
|
|
662
|
+
|
|
653
663
|
if return_failures:
|
|
654
|
-
|
|
664
|
+
returns.append(failures)
|
|
665
|
+
if return_traces:
|
|
666
|
+
returns.append(traces_list)
|
|
655
667
|
if include_parent_trace_ids:
|
|
656
|
-
|
|
657
|
-
|
|
668
|
+
returns.append(parent_trace_ids)
|
|
669
|
+
|
|
670
|
+
return tuple(returns) if len(returns) > 1 else results
|
|
658
671
|
|
|
659
672
|
def ingest_async(self, **kwargs: Any) -> Future:
|
|
660
673
|
"""
|
|
@@ -36,6 +36,8 @@ class EmbedTask(Task):
|
|
|
36
36
|
image_elements_modality: Optional[str] = None,
|
|
37
37
|
structured_elements_modality: Optional[str] = None,
|
|
38
38
|
audio_elements_modality: Optional[str] = None,
|
|
39
|
+
custom_content_field: Optional[str] = None,
|
|
40
|
+
result_target_field: Optional[str] = None,
|
|
39
41
|
) -> None:
|
|
40
42
|
"""
|
|
41
43
|
Initialize the EmbedTask configuration.
|
|
@@ -76,6 +78,8 @@ class EmbedTask(Task):
|
|
|
76
78
|
image_elements_modality=image_elements_modality,
|
|
77
79
|
structured_elements_modality=structured_elements_modality,
|
|
78
80
|
audio_elements_modality=audio_elements_modality,
|
|
81
|
+
custom_content_field=custom_content_field,
|
|
82
|
+
result_target_field=result_target_field,
|
|
79
83
|
)
|
|
80
84
|
|
|
81
85
|
self._endpoint_url = validated_data.endpoint_url
|
|
@@ -86,6 +90,8 @@ class EmbedTask(Task):
|
|
|
86
90
|
self._image_elements_modality = validated_data.image_elements_modality
|
|
87
91
|
self._structured_elements_modality = validated_data.structured_elements_modality
|
|
88
92
|
self._audio_elements_modality = validated_data.audio_elements_modality
|
|
93
|
+
self._custom_content_field = validated_data.custom_content_field
|
|
94
|
+
self._result_target_field = validated_data.result_target_field
|
|
89
95
|
|
|
90
96
|
def __str__(self) -> str:
|
|
91
97
|
"""
|
|
@@ -114,6 +120,10 @@ class EmbedTask(Task):
|
|
|
114
120
|
info += f" structured_elements_modality: {self._structured_elements_modality}\n"
|
|
115
121
|
if self._audio_elements_modality:
|
|
116
122
|
info += f" audio_elements_modality: {self._audio_elements_modality}\n"
|
|
123
|
+
if self._custom_content_field:
|
|
124
|
+
info += f" custom_content_field: {self._custom_content_field}\n"
|
|
125
|
+
if self._result_target_field:
|
|
126
|
+
info += f" result_target_field: {self.result_target_field}\n"
|
|
117
127
|
return info
|
|
118
128
|
|
|
119
129
|
def to_dict(self) -> Dict[str, Any]:
|
|
@@ -149,4 +159,10 @@ class EmbedTask(Task):
|
|
|
149
159
|
if self._audio_elements_modality:
|
|
150
160
|
task_properties["audio_elements_modality"] = self._audio_elements_modality
|
|
151
161
|
|
|
162
|
+
if self._custom_content_field:
|
|
163
|
+
task_properties["custom_content_field"] = self._custom_content_field
|
|
164
|
+
|
|
165
|
+
if self._result_target_field:
|
|
166
|
+
task_properties["result_target_field"] = self.result_target_field
|
|
167
|
+
|
|
152
168
|
return {"type": "embed", "task_properties": task_properties}
|
|
@@ -11,6 +11,7 @@ import logging
|
|
|
11
11
|
import importlib
|
|
12
12
|
import inspect
|
|
13
13
|
import ast
|
|
14
|
+
import re
|
|
14
15
|
from typing import Dict, Optional, Union
|
|
15
16
|
|
|
16
17
|
from nv_ingest_api.internal.enums.common import PipelinePhase
|
|
@@ -122,54 +123,50 @@ def _resolve_udf_function(udf_function_spec: str) -> str:
|
|
|
122
123
|
3. File path: '/path/to/file.py:my_function'
|
|
123
124
|
4. Legacy import path: 'my_module.my_function' (function name only, no imports)
|
|
124
125
|
"""
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
126
|
+
# Default to treating as inline unless it clearly matches a
|
|
127
|
+
# module/file specification. This avoids misclassifying inline code that
|
|
128
|
+
# contains colons, imports, or annotations before the def line.
|
|
128
129
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
130
|
+
spec = udf_function_spec.strip()
|
|
131
|
+
|
|
132
|
+
# 1) File path with function: /path/to/file.py:function_name
|
|
133
|
+
if ".py:" in spec:
|
|
134
|
+
file_path, function_name = spec.split(":", 1)
|
|
132
135
|
return _extract_function_with_context(file_path, function_name)
|
|
133
136
|
|
|
134
|
-
|
|
135
|
-
|
|
137
|
+
# 2) File path without function name is an explicit error
|
|
138
|
+
if spec.endswith(".py"):
|
|
136
139
|
raise ValueError(
|
|
137
|
-
f"File path '{udf_function_spec}' is missing function name. "
|
|
138
|
-
f"Use format 'file.py:function_name' to specify which function to use."
|
|
140
|
+
f"File path '{udf_function_spec}' is missing function name. Use format 'file.py:function_name'."
|
|
139
141
|
)
|
|
140
142
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
143
|
+
# 3) Module path with colon: my.module:function
|
|
144
|
+
# Be strict: only letters, numbers, underscore, and dots on the left; valid identifier on the right;
|
|
145
|
+
# no whitespace/newlines.
|
|
146
|
+
module_colon_pattern = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
|
|
147
|
+
if module_colon_pattern.match(spec):
|
|
148
|
+
module_path, function_name = spec.split(":", 1)
|
|
146
149
|
try:
|
|
147
|
-
# Import the module to get its file path
|
|
148
150
|
module = importlib.import_module(module_path)
|
|
149
151
|
module_file = inspect.getfile(module)
|
|
150
|
-
|
|
151
|
-
# Extract the function with full module context
|
|
152
152
|
return _extract_function_with_context(module_file, function_name)
|
|
153
|
-
|
|
154
153
|
except ImportError as e:
|
|
155
154
|
raise ValueError(f"Failed to import module '{module_path}': {e}")
|
|
156
155
|
except Exception as e:
|
|
157
156
|
raise ValueError(f"Failed to resolve module path '{module_path}': {e}")
|
|
158
157
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
func = _load_function_from_import_path(
|
|
163
|
-
|
|
164
|
-
# Get the source code of the function only
|
|
158
|
+
# 4) Legacy import path: my.module.function (no colon)
|
|
159
|
+
legacy_import_pattern = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")
|
|
160
|
+
if legacy_import_pattern.match(spec):
|
|
161
|
+
func = _load_function_from_import_path(spec)
|
|
165
162
|
try:
|
|
166
163
|
source = inspect.getsource(func)
|
|
167
164
|
return source
|
|
168
165
|
except (OSError, TypeError) as e:
|
|
169
166
|
raise ValueError(f"Could not get source code for function from '{udf_function_spec}': {e}")
|
|
170
167
|
|
|
171
|
-
|
|
172
|
-
|
|
168
|
+
# 5) Default: treat as inline UDF source (entire string)
|
|
169
|
+
return udf_function_spec
|
|
173
170
|
|
|
174
171
|
|
|
175
172
|
class UDFTask(Task):
|
|
@@ -776,13 +776,13 @@ def bulk_insert_milvus(
|
|
|
776
776
|
t_bulk_start = time.time()
|
|
777
777
|
task_ids = []
|
|
778
778
|
|
|
779
|
-
|
|
780
|
-
utility.do_bulk_insert(
|
|
779
|
+
for files in writer.batch_files:
|
|
780
|
+
task_id = utility.do_bulk_insert(
|
|
781
781
|
collection_name=collection_name,
|
|
782
|
-
files=
|
|
782
|
+
files=files,
|
|
783
783
|
consistency_level=CONSISTENCY,
|
|
784
784
|
)
|
|
785
|
-
|
|
785
|
+
task_ids.append(task_id)
|
|
786
786
|
|
|
787
787
|
while len(task_ids) > 0:
|
|
788
788
|
time.sleep(1)
|
|
File without changes
|
{nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/MANIFEST.in
RENAMED
|
File without changes
|
{nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/README.md
RENAMED
|
File without changes
|
{nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/pyproject.toml
RENAMED
|
File without changes
|
{nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/setup.cfg
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest_client-2025.10.26.dev20251026 → nv_ingest_client-2025.10.28.dev20251028}/src/version.py
RENAMED
|
File without changes
|