nv-ingest-client 2025.10.18.dev20251018.tar.gz → 2025.11.14.dev20251114.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nv_ingest_client-2025.10.18.dev20251018/src/nv_ingest_client.egg-info → nv_ingest_client-2025.11.14.dev20251114}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/client.py +83 -9
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/ingest_job_handler.py +28 -6
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/interface.py +128 -29
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_spec.py +3 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/embed.py +24 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/filter.py +1 -1
- nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +55 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/udf.py +24 -27
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/extract.py +4 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/milvus.py +44 -20
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/LICENSE +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/README.md +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/pyproject.toml +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/setup.cfg +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/document_analysis.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/util.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/version.py +0 -0
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/client.py CHANGED

@@ -44,6 +44,50 @@ from nv_ingest_client.util.util import (
 logger = logging.getLogger(__name__)


+def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Compute resident_time entries from entry/exit pairs if not already present.
+
+    This ensures consistency between split jobs (where server computes resident_time)
+    and non-split jobs (where we compute it client-side).
+
+    Parameters
+    ----------
+    trace_dict : Dict[str, Any]
+        Trace dictionary with entry/exit pairs
+
+    Returns
+    -------
+    Dict[str, Any]
+        Trace dictionary with resident_time entries added
+    """
+    if not trace_dict or not isinstance(trace_dict, dict):
+        return trace_dict
+
+    # Check if resident_time already exists (server-computed for split jobs)
+    has_resident = any(k.startswith("trace::resident_time::") for k in trace_dict.keys())
+    if has_resident:
+        return trace_dict  # Already computed by server
+
+    # Compute resident_time from entry/exit pairs
+    result = dict(trace_dict)
+    stages = set()
+
+    # Find all unique stages
+    for key in trace_dict:
+        if key.startswith("trace::entry::"):
+            stages.add(key.replace("trace::entry::", ""))
+
+    # Compute resident_time for each stage
+    for stage in stages:
+        entry_key = f"trace::entry::{stage}"
+        exit_key = f"trace::exit::{stage}"
+        if entry_key in trace_dict and exit_key in trace_dict:
+            result[f"trace::resident_time::{stage}"] = trace_dict[exit_key] - trace_dict[entry_key]
+
+    return result
+
+
 class DataDecodeException(Exception):
     """
     Exception raised for errors in decoding data.
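Illustrative only, not part of the diff: a quick check of how the new helper behaves, assuming the private import path below and numeric (e.g., epoch-based) timestamps.

    from nv_ingest_client.client.client import _compute_resident_times  # private helper; path assumed

    trace = {
        "trace::entry::pdf_extractor": 1_000,
        "trace::exit::pdf_extractor": 1_750,
        "trace::entry::embedder": 2_000,
        "trace::exit::embedder": 2_250,
    }
    enriched = _compute_resident_times(trace)
    assert enriched["trace::resident_time::pdf_extractor"] == 750
    assert enriched["trace::resident_time::embedder"] == 250
    # A second pass is a no-op: existing resident_time keys short-circuit the computation.
    assert _compute_resident_times(enriched) == enriched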
@@ -87,6 +131,7 @@ class _ConcurrentProcessor:
         stream_to_callback_only: bool,
         return_full_response: bool,
         verbose: bool = False,
+        return_traces: bool = False,
     ):
         """
         Initializes the concurrent processor.
@@ -120,6 +165,8 @@ class _ConcurrentProcessor:
             initiating job submission or fetching fails for a batch.
         verbose : bool, optional
             If True, enables detailed debug logging. Default is False.
+        return_traces : bool, optional
+            If True, parent-level trace data for each completed job is stored.

         Raises
         ------
@@ -142,12 +189,14 @@ class _ConcurrentProcessor:
         self.stream_to_callback_only = stream_to_callback_only
         self.return_full_response = return_full_response
         self.verbose = verbose
+        self.return_traces = return_traces

         # State variables managed across batch cycles
         self.retry_job_ids: List[str] = []
         self.retry_counts: Dict[str, int] = defaultdict(int)
         self.results: List[Dict[str, Any]] = []  # Stores successful results (full dicts)
         self.failures: List[Tuple[str, str]] = []  # (job_index, error_message)
+        self.traces: List[Optional[Dict[str, Any]]] = []

         # --- Initial Checks ---
         if not self.job_queue_id:
@@ -247,6 +296,14 @@ class _ConcurrentProcessor:
             # When requested, return the full response envelope (includes 'trace' and 'annotations')
             self.results.append(result_data if self.return_full_response else result_data.get("data"))

+            # Extract trace data for all successful (non-failed) jobs
+            if self.return_traces and not is_failed:
+                trace_payload = result_data.get("trace") if result_data else None
+                # Compute resident_time if not already present (for consistency)
+                if trace_payload:
+                    trace_payload = _compute_resident_times(trace_payload)
+                self.traces.append(trace_payload if trace_payload else None)
+
             # Cleanup retry count if it exists
             if job_index in self.retry_counts:
                 del self.retry_counts[job_index]
@@ -438,7 +495,7 @@ class _ConcurrentProcessor:

         return batch_futures_dict, normalized_job_indices

-    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
+    def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
         """
         Executes the main processing loop in batches.

@@ -581,7 +638,7 @@ class _ConcurrentProcessor:
         # --- Final Logging ---
         self._log_final_status(total_jobs)

-        return self.results, self.failures
+        return self.results, self.failures, self.traces if self.return_traces else []


 class NvIngestClient:
@@ -1212,7 +1269,12 @@ class NvIngestClient:
         stream_to_callback_only: bool = False,
         return_full_response: bool = False,
         verbose: bool = False,
-    ):
+        return_traces: bool = False,
+    ) -> Union[
+        List[Any],
+        Tuple[List[Any], List[Tuple[str, str]]],
+        Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
+    ]:
         """
         Submit and fetch multiple jobs concurrently.

@@ -1247,6 +1309,8 @@ class NvIngestClient:
             Ignored when stream_to_callback_only=True. Default is False.
         verbose : bool, optional
             If True, enable debug logging. Default is False.
+        return_traces : bool, optional
+            If True, parent-level aggregated trace metrics are extracted and returned. Default is False.

         Returns
         -------
@@ -1254,6 +1318,9 @@ class NvIngestClient:
             List of successful job results when `return_failures` is False.
         results, failures : tuple
             Tuple of (successful results, failure tuples) when `return_failures` is True.
+        results, failures, traces : tuple
+            Tuple of (successful results, failure tuples, trace dicts) when both
+            `return_failures` and `return_traces` are True.

         Raises
         ------
@@ -1266,7 +1333,12 @@ class NvIngestClient:

         # Handle empty input
         if not job_indices:
-
+            if return_failures and return_traces:
+                return [], [], []
+            elif return_failures:
+                return [], []
+            else:
+                return []

         # Validate and set batch_size
         validated_batch_size = self._validate_batch_size(batch_size)
@@ -1289,12 +1361,17 @@ class NvIngestClient:
             stream_to_callback_only=stream_to_callback_only,
             return_full_response=return_full_response,
             verbose=verbose,
+            return_traces=return_traces,
         )

-        results, failures = processor.run()
+        results, failures, traces = processor.run()

-        if return_failures:
+        if return_failures and return_traces:
+            return results, failures, traces
+        elif return_failures:
             return results, failures
+        elif return_traces:
+            return results, traces

         if failures:
             logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
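For orientation, not part of the diff: a sketch of unpacking the new return shapes; `client`, `job_indices`, and the queue name are placeholders.

    # Both flags set: fixed three-tuple (results, failures, traces).
    results, failures, traces = client.process_jobs_concurrently(
        job_indices, job_queue_id="ingest_task_queue", return_failures=True, return_traces=True
    )

    # Traces only: two-tuple (results, traces); failures are logged, not returned.
    results, traces = client.process_jobs_concurrently(
        job_indices, job_queue_id="ingest_task_queue", return_traces=True
    )

    for trace in traces:
        if trace:  # entries are None when a job produced no trace payload
            resident = {k: v for k, v in trace.items() if k.startswith("trace::resident_time::")}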
@@ -1628,9 +1705,6 @@ class NvIngestClient:
             )
             logger.error(error_msg)
             failures.append((self._job_index_to_job_spec[job_id].source_id, str(e)))
-        finally:
-            # Clean up the job spec mapping
-            del self._job_index_to_job_spec[job_id]

         if return_failures:
             return results, failures
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/ingest_job_handler.py CHANGED

@@ -323,18 +323,40 @@ class IngestJobHandler:

         futures_dict: Dict[Any, str] = self.client.fetch_job_result_async(self._job_ids_batch, data_only=False)
         for future in as_completed(futures_dict.keys()):
+            pages_per_sec = None
             try:
                 # Block as each future completes; this mirrors CLI behavior
                 future_response, trace_id = self._handle_future_result(future)
                 job_id: str = futures_dict[future]
                 trace_ids[job_id_map[job_id]] = trace_id

-
-
-
-
-
-
+                # Extract page count: prefer V2 metadata location, fall back to V1
+                page_count = None
+                source_name = None
+
+                # Try V2 metadata location first (top-level metadata.total_pages)
+                if "metadata" in future_response and future_response["metadata"]:
+                    response_metadata = future_response["metadata"]
+                    page_count = response_metadata.get("total_pages")
+                    source_name = response_metadata.get("original_source_name")
+
+                # Fall back to V1 location (first data element's hierarchy.page_count)
+                if page_count is None and future_response.get("data"):
+                    try:
+                        first_page_metadata = future_response["data"][0]["metadata"]
+                        page_count = first_page_metadata["content_metadata"]["hierarchy"]["page_count"]
+                        source_name = first_page_metadata["source_metadata"]["source_name"]
+                    except (KeyError, IndexError, TypeError):
+                        # If we can't extract from V1 location, use defaults
+                        pass
+
+                # Use extracted values or defaults
+                if page_count is None:
+                    page_count = 0  # Default if not found
+                if source_name is None:
+                    source_name = "unknown_source"
+
+                file_page_counts: Dict[str, int] = {source_name: page_count}

                 if self.output_directory:
                     self._save_response_data(
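Response shapes assumed by the V2-first/V1-fallback lookup above (field names taken from the diff; values hypothetical):

    # V2 envelope: page count and source name live in the top-level metadata block.
    v2_response = {
        "metadata": {"total_pages": 12, "original_source_name": "report.pdf"},
        "data": [],
    }

    # V1 envelope: the same facts hang off the first data element's metadata.
    v1_response = {
        "metadata": None,
        "data": [
            {
                "metadata": {
                    "content_metadata": {"hierarchy": {"page_count": 12}},
                    "source_metadata": {"source_name": "report.pdf"},
                }
            }
        ],
    }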
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/interface.py CHANGED

@@ -402,16 +402,9 @@ class Ingestor:
         show_progress: bool = False,
         return_failures: bool = False,
         save_to_disk: bool = False,
+        return_traces: bool = False,
         **kwargs: Any,
-    ) -> Union[
-        List[List[Dict[str, Any]]],  # In-memory: List of response['data'] for each doc
-        List[Dict[str, Any]],  # In-memory: Full response envelopes when return_full_response=True
-        List[LazyLoadedList],  # Disk: List of proxies, one per original doc
-        Tuple[
-            Union[List[List[Dict[str, Any]]], List[Dict[str, Any]], List[LazyLoadedList]],
-            List[Tuple[str, str]],
-        ],
-    ]:  # noqa: E501
+    ) -> Union[List[Any], Tuple[Any, ...]]:
         """
         Ingest documents by submitting jobs and fetching results concurrently.

@@ -421,24 +414,30 @@ class Ingestor:
             Whether to display a progress bar. Default is False.
         return_failures : bool, optional
             If True, return a tuple (results, failures); otherwise, return only results. Default is False.
+        save_to_disk : bool, optional
+            If True, save results to disk and return LazyLoadedList proxies. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
+            Traces contain timing metrics (entry, exit, resident_time) for each stage.
         **kwargs : Any
-            Additional keyword arguments for the underlying client methods.
-            'concurrency_limit', 'timeout', 'max_job_retries', 'retry_delay',
-            'data_only', 'return_full_response', 'verbose'. Unrecognized keys are passed
-            through to process_jobs_concurrently.
+            Additional keyword arguments for the underlying client methods.
             Optional flags include `include_parent_trace_ids=True` to also return
-            parent job trace identifiers
+            parent job trace identifiers (V2 API only).

         Returns
         -------
-
-
+        list or tuple
+            Returns vary based on flags:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+            - Additional combinations with include_parent_trace_ids kwarg

-
-
-
-
-        list of parent trace IDs is appended to the return value.
+        Notes
+        -----
+        Trace metrics include timing data for each processing stage. For detailed
+        usage and examples, see src/nv_ingest/api/v2/README.md
         """
         if save_to_disk and (not self._output_config):
             self.save_to_disk()
@@ -574,7 +573,8 @@ class Ingestor:
         if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
             self._client.enable_telemetry(bool(enable_telemetry))

-        results, failures = self._client.process_jobs_concurrently(
+        # Call process_jobs_concurrently
+        proc_result = self._client.process_jobs_concurrently(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
             timeout=timeout,
@@ -583,9 +583,17 @@ class Ingestor:
             return_failures=True,
             stream_to_callback_only=stream_to_callback_only,
             verbose=verbose,
+            return_traces=return_traces,
             **proc_kwargs,
         )

+        # Unpack result based on return_traces flag
+        if return_traces:
+            results, failures, traces_list = proc_result
+        else:
+            results, failures = proc_result
+            traces_list = []  # Empty list when traces not requested
+
         if show_progress and pbar:
             pbar.close()

@@ -648,13 +656,18 @@ class Ingestor:

         parent_trace_ids = self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []

-
-
+        # Build return tuple based on requested outputs
+        # Order: results, failures (if requested), traces (if requested), parent_trace_ids (if requested)
+        returns = [results]
+
         if return_failures:
-
+            returns.append(failures)
+        if return_traces:
+            returns.append(traces_list)
         if include_parent_trace_ids:
-
-
+            returns.append(parent_trace_ids)
+
+        return tuple(returns) if len(returns) > 1 else results

     def ingest_async(self, **kwargs: Any) -> Future:
         """
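Sketch of call sites implied by the fixed ordering above (document paths are placeholders; the Ingestor import and documents= kwarg match the get_status docstring later in this file):

    from nv_ingest_client.client import Ingestor

    ingestor = Ingestor(documents=["a.pdf", "b.pdf"]).extract().embed()

    results = ingestor.ingest()                                # results only
    results, failures = ingestor.ingest(return_failures=True)  # (results, failures)
    results, traces = ingestor.ingest(return_traces=True)      # (results, traces)
    results, failures, traces = ingestor.ingest(               # full tuple, fixed order
        return_failures=True, return_traces=True
    )
    # include_parent_trace_ids=True appends parent trace IDs as the final element.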
@@ -681,6 +694,7 @@ class Ingestor:
         submitted_futures = set(future_to_job_id.keys())
         completed_futures = set()
         future_results = []
+        vdb_future = None

         def _done_callback(future):
             job_id = future_to_job_id[future]
@@ -702,9 +716,10 @@ class Ingestor:
             future.add_done_callback(_done_callback)

         if self._vdb_bulk_upload:
-
+            executor = ThreadPoolExecutor(max_workers=1)
+            vdb_future = executor.submit(self._vdb_bulk_upload.run_async, combined_future)

-        return combined_future
+        return combined_future if not vdb_future else vdb_future

     @ensure_job_specs
     def _prepare_ingest_run(self):
@@ -821,6 +836,7 @@ class Ingestor:
         extract_tables = kwargs.pop("extract_tables", True)
         extract_charts = kwargs.pop("extract_charts", True)
         extract_page_as_image = kwargs.pop("extract_page_as_image", False)
+        table_output_format = kwargs.pop("table_output_format", "markdown")

         # Defaulting to False since enabling infographic extraction reduces throughput.
         # Users have to set to True if infographic extraction is required.
@@ -843,6 +859,7 @@ class Ingestor:
             extract_charts=extract_charts,
             extract_infographics=extract_infographics,
             extract_page_as_image=extract_page_as_image,
+            table_output_format=table_output_format,
             **kwargs,
         )

@@ -1346,3 +1363,85 @@ class Ingestor:
         terminal_jobs = self.completed_jobs() + self.failed_jobs() + self.cancelled_jobs()

         return len(self._job_states) - terminal_jobs
+
+    def get_status(self) -> Dict[str, str]:
+        """
+        Returns a dictionary mapping document identifiers to their current status in the pipeline.
+
+        This method is designed for use with async ingestion to poll the status of submitted jobs.
+        For each document submitted to the ingestor, the method returns its current processing state.
+
+        Returns
+        -------
+        Dict[str, str]
+            A dictionary where:
+            - Keys are document identifiers (source names or source IDs)
+            - Values are status strings representing the current state:
+                * "pending": Job created but not yet submitted
+                * "submitted": Job submitted and waiting for processing
+                * "processing": Job is currently being processed
+                * "completed": Job finished successfully
+                * "failed": Job encountered an error
+                * "cancelled": Job was cancelled
+                * "unknown": Job state could not be determined (initial state)
+
+        Examples
+        --------
+        >>> ingestor = Ingestor(documents=["doc1.pdf", "doc2.pdf"], client=client)
+        >>> ingestor.extract().embed()
+        >>> future = ingestor.ingest_async()
+        >>>
+        >>> # Poll status while processing
+        >>> status = ingestor.get_status()
+        >>> print(status)
+        {'doc1.pdf': 'processing', 'doc2.pdf': 'submitted'}
+        >>>
+        >>> # Check again after some time
+        >>> status = ingestor.get_status()
+        >>> print(status)
+        {'doc1.pdf': 'completed', 'doc2.pdf': 'processing'}
+
+        Notes
+        -----
+        - This method is most useful when called after `ingest_async()` to track progress
+        - If called before any jobs are submitted, returns an empty dictionary or
+          documents with "unknown" status
+        - The method accesses internal job state from the client, so it reflects
+          the most current known state
+        """
+        status_dict = {}
+
+        if not self._job_states:
+            # If job states haven't been initialized yet (before ingest_async is called)
+            # Return unknown status for all documents
+            for doc in self._documents:
+                doc_name = os.path.basename(doc) if isinstance(doc, str) else str(doc)
+                status_dict[doc_name] = "unknown"
+            return status_dict
+
+        # Map job IDs to their states and source identifiers
+        for job_id, job_state in self._job_states.items():
+            # Get the job spec to find the source identifier
+            job_spec = self._client._job_index_to_job_spec.get(job_id)
+
+            if job_spec:
+                # Use source_name as the key (the document name)
+                source_identifier = job_spec.source_name
+            else:
+                # Fallback to job_id if we can't find the spec
+                source_identifier = f"job_{job_id}"
+
+            # Map the JobStateEnum to a user-friendly string
+            state_mapping = {
+                JobStateEnum.PENDING: "pending",
+                JobStateEnum.SUBMITTED_ASYNC: "submitted",
+                JobStateEnum.SUBMITTED: "submitted",
+                JobStateEnum.PROCESSING: "processing",
+                JobStateEnum.COMPLETED: "completed",
+                JobStateEnum.FAILED: "failed",
+                JobStateEnum.CANCELLED: "cancelled",
+            }
+
+            status_dict[source_identifier] = state_mapping.get(job_state.state, "unknown")
+
+        return status_dict
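Beyond the doctest-style examples above, a hedged polling sketch (interval arbitrary; `ingestor` built as in the docstring example):

    import time

    future = ingestor.ingest_async()
    while not future.done():
        in_flight = {doc: s for doc, s in ingestor.get_status().items()
                     if s not in ("completed", "failed", "cancelled")}
        print(f"{len(in_flight)} document(s) still in flight: {in_flight}")
        time.sleep(2)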
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_spec.py CHANGED

@@ -18,6 +18,7 @@ from nv_ingest_client.primitives.tasks.audio_extraction import AudioExtractionTask
 from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
 from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
 from nv_ingest_client.primitives.tasks.infographic_extraction import InfographicExtractionTask
+from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask
 from nv_ingest_client.util.dataset import get_dataset_files
 from nv_ingest_client.util.dataset import get_dataset_statistics

@@ -199,6 +200,8 @@ class JobSpec:
             self._tasks.append(ChartExtractionTask())
         if isinstance(task, ExtractTask) and (task._extract_infographics is True):
             self._tasks.append(InfographicExtractionTask())
+        if isinstance(task, ExtractTask) and (task._extract_method in {"ocr"}):
+            self._tasks.append(OCRExtractionTask())
         if isinstance(task, ExtractTask) and (task._extract_method == "audio"):
             extract_audio_params = task._extract_audio_params or {}
             self._tasks.append(AudioExtractionTask(**extract_audio_params))
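Illustrative, not from the diff: with this branch, an "ocr" extract method auto-enqueues the companion task, mirroring the existing chart/infographic pattern. The ExtractTask keyword arguments below are assumptions.

    from nv_ingest_client.primitives.tasks.extract import ExtractTask

    task = ExtractTask(document_type="pdf", extract_method="ocr")  # kwargs assumed
    job_spec.add_task(task)  # job_spec: an existing JobSpec instance
    # The job spec's task list now also contains an auto-appended OCRExtractionTask().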
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/embed.py CHANGED

@@ -36,6 +36,9 @@ class EmbedTask(Task):
         image_elements_modality: Optional[str] = None,
         structured_elements_modality: Optional[str] = None,
         audio_elements_modality: Optional[str] = None,
+        custom_content_field: Optional[str] = None,
+        result_target_field: Optional[str] = None,
+        dimensions: Optional[int] = None,
     ) -> None:
         """
         Initialize the EmbedTask configuration.
@@ -76,6 +79,9 @@ class EmbedTask(Task):
             image_elements_modality=image_elements_modality,
             structured_elements_modality=structured_elements_modality,
             audio_elements_modality=audio_elements_modality,
+            custom_content_field=custom_content_field,
+            result_target_field=result_target_field,
+            dimensions=dimensions,
         )

         self._endpoint_url = validated_data.endpoint_url
@@ -86,6 +92,9 @@ class EmbedTask(Task):
         self._image_elements_modality = validated_data.image_elements_modality
         self._structured_elements_modality = validated_data.structured_elements_modality
         self._audio_elements_modality = validated_data.audio_elements_modality
+        self._custom_content_field = validated_data.custom_content_field
+        self._result_target_field = validated_data.result_target_field
+        self._dimensions = validated_data.dimensions

     def __str__(self) -> str:
         """
@@ -114,6 +123,12 @@ class EmbedTask(Task):
             info += f"  structured_elements_modality: {self._structured_elements_modality}\n"
         if self._audio_elements_modality:
             info += f"  audio_elements_modality: {self._audio_elements_modality}\n"
+        if self._custom_content_field:
+            info += f"  custom_content_field: {self._custom_content_field}\n"
+        if self._result_target_field:
+            info += f"  result_target_field: {self.result_target_field}\n"
+        if self._dimensions:
+            info += f"  dimensions: {self._dimensions}\n"
         return info

     def to_dict(self) -> Dict[str, Any]:
@@ -149,4 +164,13 @@ class EmbedTask(Task):
         if self._audio_elements_modality:
             task_properties["audio_elements_modality"] = self._audio_elements_modality

+        if self._custom_content_field:
+            task_properties["custom_content_field"] = self._custom_content_field
+
+        if self._result_target_field:
+            task_properties["result_target_field"] = self._result_target_field
+
+        if self._dimensions:
+            task_properties["dimensions"] = self._dimensions
+
         return {"type": "embed", "task_properties": task_properties}
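A sketch of the new embed knobs end to end (values hypothetical; field semantics inferred from the names): each optional field is serialized only when set, so payloads that omit them are unchanged.

    from nv_ingest_client.primitives.tasks.embed import EmbedTask

    task = EmbedTask(
        custom_content_field="summary_text",   # read the text to embed from this field (assumed semantics)
        result_target_field="summary_vector",  # write the embedding to this field (assumed semantics)
        dimensions=2048,                       # requested embedding dimensionality
    )
    payload = task.to_dict()
    assert payload["type"] == "embed"
    assert payload["task_properties"]["dimensions"] == 2048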
nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/primitives/tasks/ocr_extraction.py ADDED

@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+# pylint: disable=too-few-public-methods
+# pylint: disable=too-many-arguments
+
+import logging
+from typing import Dict
+
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskOCRExtraction
+from nv_ingest_client.primitives.tasks.task_base import Task
+
+logger = logging.getLogger(__name__)
+
+
+class OCRExtractionTask(Task):
+    """
+    Object for ocr extraction task
+    """
+
+    def __init__(self, params: dict = None) -> None:
+        """
+        Setup OCR Extraction Task Config
+        """
+        super().__init__()
+
+        # Handle None params by converting to empty dict for backward compatibility
+        if params is None:
+            params = {}
+
+        # Use the API schema for validation
+        validated_data = IngestTaskOCRExtraction(params=params)
+
+        self._params = validated_data.params
+
+    def __str__(self) -> str:
+        """
+        Returns a string with the object's config and run time state
+        """
+        info = ""
+        info += "OCR Extraction Task:\n"
+        info += f"  params: {self._params}\n"
+        return info
+
+    def to_dict(self) -> Dict:
+        """
+        Convert to a dict for submission to redis
+        """
+        task_properties = {
+            "params": self._params,
+        }
+
+        return {"type": "ocr_data_extract", "task_properties": task_properties}
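Not part of the diff: constructing the new task directly shows the wire format it emits (params contents hypothetical).

    from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask

    task = OCRExtractionTask(params={"lang": "en"})
    print(task.to_dict())
    # Expected shape: {'type': 'ocr_data_extract', 'task_properties': {'params': {'lang': 'en'}}}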
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/udf.py CHANGED

@@ -11,6 +11,7 @@ import logging
 import importlib
 import inspect
 import ast
+import re
 from typing import Dict, Optional, Union

 from nv_ingest_api.internal.enums.common import PipelinePhase
@@ -122,54 +123,50 @@ def _resolve_udf_function(udf_function_spec: str) -> str:
     3. File path: '/path/to/file.py:my_function'
     4. Legacy import path: 'my_module.my_function' (function name only, no imports)
     """
-
-
-
+    # Default to treating as inline unless it clearly matches a
+    # module/file specification. This avoids misclassifying inline code that
+    # contains colons, imports, or annotations before the def line.

-
-
-
+    spec = udf_function_spec.strip()
+
+    # 1) File path with function: /path/to/file.py:function_name
+    if ".py:" in spec:
+        file_path, function_name = spec.split(":", 1)
         return _extract_function_with_context(file_path, function_name)

-
-
+    # 2) File path without function name is an explicit error
+    if spec.endswith(".py"):
         raise ValueError(
-            f"File path '{udf_function_spec}' is missing function name. "
-            f"Use format 'file.py:function_name' to specify which function to use."
+            f"File path '{udf_function_spec}' is missing function name. Use format 'file.py:function_name'."
         )

-
-
-
-
-
+    # 3) Module path with colon: my.module:function
+    # Be strict: only letters, numbers, underscore, and dots on the left; valid identifier on the right;
+    # no whitespace/newlines.
+    module_colon_pattern = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
+    if module_colon_pattern.match(spec):
+        module_path, function_name = spec.split(":", 1)
         try:
-            # Import the module to get its file path
             module = importlib.import_module(module_path)
             module_file = inspect.getfile(module)
-
-            # Extract the function with full module context
             return _extract_function_with_context(module_file, function_name)
-
         except ImportError as e:
             raise ValueError(f"Failed to import module '{module_path}': {e}")
         except Exception as e:
             raise ValueError(f"Failed to resolve module path '{module_path}': {e}")

-
-
-
-        func = _load_function_from_import_path(
-
-        # Get the source code of the function only
+    # 4) Legacy import path: my.module.function (no colon)
+    legacy_import_pattern = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")
+    if legacy_import_pattern.match(spec):
+        func = _load_function_from_import_path(spec)
         try:
             source = inspect.getsource(func)
             return source
         except (OSError, TypeError) as e:
             raise ValueError(f"Could not get source code for function from '{udf_function_spec}': {e}")

-
-
+    # 5) Default: treat as inline UDF source (entire string)
+    return udf_function_spec


 class UDFTask(Task):
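To see why the stricter patterns matter, a standalone sketch of how the four spec formats classify; the regexes are copied from the diff and the branch order mirrors _resolve_udf_function.

    import re

    module_colon = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
    legacy_import = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")

    def classify(spec: str) -> str:
        if ".py:" in spec:
            return "file path + function"
        if spec.endswith(".py"):
            return "error: missing function name"
        if module_colon.match(spec):
            return "module path with colon"
        if legacy_import.match(spec):
            return "legacy import path"
        return "inline source (default)"

    assert classify("/path/to/file.py:my_function") == "file path + function"
    assert classify("my.module:my_function") == "module path with colon"
    assert classify("my_module.my_function") == "legacy import path"
    assert classify("import re\ndef udf(ctx): ...") == "inline source (default)"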
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/extract.py CHANGED

@@ -51,6 +51,10 @@ EXTENSION_TO_DOCUMENT_TYPE = {
     "txt": DocumentTypeEnum.TXT,
     "mp3": DocumentTypeEnum.MP3,
     "wav": DocumentTypeEnum.WAV,
+    "mp4": DocumentTypeEnum.MP4,
+    "mov": DocumentTypeEnum.MOV,
+    "avi": DocumentTypeEnum.AVI,
+    "mkv": DocumentTypeEnum.MKV,
     # Add more as needed
 }

{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/milvus.py CHANGED

@@ -44,6 +44,7 @@ from scipy.sparse import csr_array
 logger = logging.getLogger(__name__)

 CONSISTENCY = CONSISTENCY_BOUNDED
+DENSE_INDEX_NAME = "dense_index"

 pandas_reader_map = {
     ".json": pd.read_json,
@@ -93,7 +94,7 @@ def create_meta_collection(
     index_params = MilvusClient.prepare_index_params()
     index_params.add_index(
         field_name="vector",
-        index_name="dense_index",
+        index_name=DENSE_INDEX_NAME,
         index_type="FLAT",
         metric_type="L2",
     )
@@ -313,7 +314,7 @@ def create_nvingest_index_params(
     if local_index:
         index_params.add_index(
             field_name="vector",
-            index_name="dense_index",
+            index_name=DENSE_INDEX_NAME,
             index_type="FLAT",
             metric_type="L2",
         )
@@ -321,7 +322,7 @@
     if gpu_index:
         index_params.add_index(
             field_name="vector",
-            index_name="dense_index",
+            index_name=DENSE_INDEX_NAME,
             index_type="GPU_CAGRA",
             metric_type="L2",
             params={
@@ -335,7 +336,7 @@
     else:
         index_params.add_index(
             field_name="vector",
-            index_name="dense_index",
+            index_name=DENSE_INDEX_NAME,
             index_type="HNSW",
             metric_type="L2",
             params={"M": 64, "efConstruction": 512},
@@ -493,7 +494,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
     if isinstance(indexes, dict):
         # Old Milvus behavior (< 2.5.6)
         for k, v in indexes.items():
-            if k[1] == "dense_index" and hasattr(v, "_index_type"):
+            if k[1] == DENSE_INDEX_NAME and hasattr(v, "_index_type"):
                 d_idx = v._index_type
             if sparse and k[1] == "sparse_index" and hasattr(v, "_index_type"):
                 s_idx = v._index_type
@@ -504,7 +505,7 @@
         index_name = getattr(idx, "index_name", None)
         index_type = getattr(idx, "index_type", None)

-        if index_name == "dense_index":
+        if index_name == DENSE_INDEX_NAME:
             d_idx = index_type
         if sparse and index_name == "sparse_index":
             s_idx = index_type
@@ -776,13 +777,13 @@ def bulk_insert_milvus(
     t_bulk_start = time.time()
     task_ids = []

-
-    utility.do_bulk_insert(
+    for files in writer.batch_files:
+        task_id = utility.do_bulk_insert(
             collection_name=collection_name,
-        files=
+            files=files,
             consistency_level=CONSISTENCY,
         )
-
+        task_ids.append(task_id)

     while len(task_ids) > 0:
         time.sleep(1)
@@ -900,30 +901,32 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
     (refer to MilvusClient.refresh_load for bulk inserts).
     """
     client.flush(collection_name)
-    index_names = utility.list_indexes(collection_name)
+    # index_names = utility.list_indexes(collection_name)
     indexed_rows = 0
-
+    # observe dense_index, all indexes get populated simultaneously
+    for index_name in [DENSE_INDEX_NAME]:
         indexed_rows = 0
-
+        expected_rows = client.describe_index(collection_name, index_name)["indexed_rows"] + num_elements
+        while indexed_rows < expected_rows:
             pos_movement = 10  # number of iteration allowed without noticing an increase in indexed_rows
             for i in range(20):
-
+                current_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
                 time.sleep(1)
                 logger.info(
-                    f"
+                    f"Indexed rows, {collection_name}, {index_name} - {current_indexed_rows} / {expected_rows}"
                 )
-                if
-                    indexed_rows =
+                if current_indexed_rows == expected_rows:
+                    indexed_rows = current_indexed_rows
                     break
                 # check if indexed_rows is staying the same, too many times means something is wrong
-                if
+                if current_indexed_rows == indexed_rows:
                     pos_movement -= 1
                 else:
                     pos_movement = 10
                 # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
                 if pos_movement == 0:
-                    raise ValueError("Rows are not getting indexed as expected")
-                indexed_rows =
+                    raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
+                indexed_rows = current_indexed_rows
     return indexed_rows
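The stall detection above generalizes to this standalone sketch (not library code): poll a monotone counter, reset the patience budget on progress, and fail after too many stagnant reads.

    import time
    from typing import Callable

    def poll_until(read_count: Callable[[], int], target: int,
                   patience: int = 10, interval: float = 1.0) -> int:
        last, stagnant = -1, 0
        while True:
            current = read_count()
            if current >= target:
                return current
            stagnant = stagnant + 1 if current == last else 0
            if stagnant >= patience:
                raise ValueError(f"progress stalled at {current}/{target}")
            last = current
            time.sleep(interval)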
@@ -2057,3 +2060,24 @@ class Milvus(VDB):
             self.write_to_index(records, collection_name=coll_name, **sub_write_params)
         else:
             raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
+        return records
+
+    def run_async(self, records):
+        collection_name, create_params = self.get_connection_params()
+        _, write_params = self.get_write_params()
+        if isinstance(collection_name, str):
+            logger.info(f"creating index - {collection_name}")
+            self.create_index(collection_name=collection_name, **create_params)
+            records = records.result()
+            logger.info(f"writing to index, for collection - {collection_name}")
+            self.write_to_index(records, **write_params)
+        elif isinstance(collection_name, dict):
+            split_params_list = _dict_to_params(collection_name, write_params)
+            for sub_params in split_params_list:
+                coll_name, sub_write_params = sub_params
+                sub_write_params.pop("collection_name", None)
+                self.create_index(collection_name=coll_name, **create_params)
+                self.write_to_index(records, collection_name=coll_name, **sub_write_params)
+        else:
+            raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
+        return records
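Sketch of the intended call pattern for run_async (construction arguments assumed): it blocks on records.result() only after the collection and index exist, which is why ingest_async submits it to a single-worker executor rather than calling it inline.

    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor(max_workers=2)
    # Stand-in for the pipeline's combined future (payload hypothetical).
    records_future = executor.submit(lambda: [{"text": "chunk", "vector": [0.0] * 8}])

    # milvus = Milvus(collection_name="nv_ingest_collection", ...)  # args assumed
    # vdb_future = executor.submit(milvus.run_async, records_future)
    # records = vdb_future.result()  # create_index, then write_to_index, then records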
{nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/SOURCES.txt CHANGED

@@ -34,6 +34,7 @@ src/nv_ingest_client/primitives/tasks/embed.py
 src/nv_ingest_client/primitives/tasks/extract.py
 src/nv_ingest_client/primitives/tasks/filter.py
 src/nv_ingest_client/primitives/tasks/infographic_extraction.py
+src/nv_ingest_client/primitives/tasks/ocr_extraction.py
 src/nv_ingest_client/primitives/tasks/split.py
 src/nv_ingest_client/primitives/tasks/store.py
 src/nv_ingest_client/primitives/tasks/table_extraction.py