nv-ingest-client 2025.9.30.dev20250930__tar.gz → 2025.10.1.dev20251001__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- {nv_ingest_client-2025.9.30.dev20250930/src/nv_ingest_client.egg-info → nv_ingest_client-2025.10.1.dev20251001}/PKG-INFO +1 -1
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/client/interface.py +24 -5
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/client/util/processing.py +11 -1
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/LICENSE +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/MANIFEST.in +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/README.md +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/pyproject.toml +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/setup.cfg +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/cli/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/cli/util/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/cli/util/click.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/cli/util/processing.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/cli/util/system.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/client/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/client/client.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/jobs/job_spec.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/dataset.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/document_analysis.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/milvus.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/process_json_files.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/processing.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/system.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/transport.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/util.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/vdb/milvus.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client/util/zipkin.py +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client.egg-info/SOURCES.txt +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client.egg-info/requires.txt +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
- {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/version.py +0 -0
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
import collections
|
|
8
8
|
import glob
|
|
9
|
+
import gzip
|
|
9
10
|
import json
|
|
10
11
|
import logging
|
|
11
12
|
import os
|
|
@@ -93,17 +94,20 @@ def ensure_job_specs(func):
|
|
|
93
94
|
|
|
94
95
|
|
|
95
96
|
class LazyLoadedList(collections.abc.Sequence):
|
|
96
|
-
def __init__(self, filepath: str, expected_len: Optional[int] = None):
|
|
97
|
+
def __init__(self, filepath: str, expected_len: Optional[int] = None, compression: Optional[str] = None):
|
|
97
98
|
self.filepath = filepath
|
|
98
99
|
self._len: Optional[int] = expected_len # Store pre-calculated length
|
|
99
100
|
self._offsets: Optional[List[int]] = None
|
|
101
|
+
self.compression = compression
|
|
100
102
|
|
|
101
103
|
if self._len == 0:
|
|
102
104
|
self._offsets = []
|
|
103
105
|
|
|
106
|
+
self._open = gzip.open if self.compression == "gzip" else open
|
|
107
|
+
|
|
104
108
|
def __iter__(self) -> Iterator[Any]:
|
|
105
109
|
try:
|
|
106
|
-
with
|
|
110
|
+
with self._open(self.filepath, "rt", encoding="utf-8") as f:
|
|
107
111
|
for line in f:
|
|
108
112
|
yield json.loads(line)
|
|
109
113
|
except FileNotFoundError:
|
|
@@ -120,7 +124,7 @@ class LazyLoadedList(collections.abc.Sequence):
|
|
|
120
124
|
self._offsets = []
|
|
121
125
|
line_count = 0
|
|
122
126
|
try:
|
|
123
|
-
with
|
|
127
|
+
with self._open(self.filepath, "rb") as f:
|
|
124
128
|
while True:
|
|
125
129
|
current_pos = f.tell()
|
|
126
130
|
line = f.readline()
|
|
@@ -144,10 +148,12 @@ class LazyLoadedList(collections.abc.Sequence):
|
|
|
144
148
|
def __len__(self) -> int:
|
|
145
149
|
if self._len is not None:
|
|
146
150
|
return self._len
|
|
151
|
+
|
|
147
152
|
if self._offsets is not None:
|
|
148
153
|
self._len = len(self._offsets)
|
|
149
154
|
return self._len
|
|
150
155
|
self._build_index()
|
|
156
|
+
|
|
151
157
|
return self._len if self._len is not None else 0
|
|
152
158
|
|
|
153
159
|
def __getitem__(self, idx: int) -> Any:
|
|
@@ -170,7 +176,7 @@ class LazyLoadedList(collections.abc.Sequence):
|
|
|
170
176
|
raise IndexError(f"Index {idx} out of range for {self.filepath} (len: {len(self._offsets)})")
|
|
171
177
|
|
|
172
178
|
try:
|
|
173
|
-
with
|
|
179
|
+
with self._open(self.filepath, "rb") as f:
|
|
174
180
|
f.seek(self._offsets[idx])
|
|
175
181
|
line_bytes = f.readline()
|
|
176
182
|
return json.loads(line_bytes.decode("utf-8"))
|
|
@@ -455,6 +461,8 @@ class Ingestor:
|
|
|
455
461
|
clean_source_basename = get_valid_filename(os.path.basename(source_name))
|
|
456
462
|
file_name, file_ext = os.path.splitext(clean_source_basename)
|
|
457
463
|
file_suffix = f".{file_ext.strip('.')}.results.jsonl"
|
|
464
|
+
if self._output_config["compression"] == "gzip":
|
|
465
|
+
file_suffix += ".gz"
|
|
458
466
|
jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
|
|
459
467
|
|
|
460
468
|
num_items_saved = save_document_results_to_jsonl(
|
|
@@ -462,10 +470,13 @@ class Ingestor:
|
|
|
462
470
|
jsonl_filepath,
|
|
463
471
|
source_name,
|
|
464
472
|
ensure_parent_dir_exists=False,
|
|
473
|
+
compression=self._output_config["compression"],
|
|
465
474
|
)
|
|
466
475
|
|
|
467
476
|
if num_items_saved > 0:
|
|
468
|
-
results = LazyLoadedList(
|
|
477
|
+
results = LazyLoadedList(
|
|
478
|
+
jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
|
|
479
|
+
)
|
|
469
480
|
if results_lock:
|
|
470
481
|
with results_lock:
|
|
471
482
|
final_results_payload_list.append(results)
|
|
@@ -1068,6 +1079,7 @@ class Ingestor:
|
|
|
1068
1079
|
self,
|
|
1069
1080
|
output_directory: Optional[str] = None,
|
|
1070
1081
|
cleanup: bool = True,
|
|
1082
|
+
compression: Optional[str] = "gzip",
|
|
1071
1083
|
) -> "Ingestor":
|
|
1072
1084
|
"""Configures the Ingestor to save results to disk instead of memory.
|
|
1073
1085
|
|
|
@@ -1092,6 +1104,12 @@ class Ingestor:
|
|
|
1092
1104
|
when the Ingestor's context is exited (i.e., when used in a `with`
|
|
1093
1105
|
statement).
|
|
1094
1106
|
Defaults to True.
|
|
1107
|
+
compression : str, optional
|
|
1108
|
+
The compression algorithm to use for the saved result files.
|
|
1109
|
+
Currently, the only supported value is `'gzip'`. To disable
|
|
1110
|
+
compression, set this parameter to `None`. Defaults to `'gzip'`,
|
|
1111
|
+
which significantly reduces the disk space required for results.
|
|
1112
|
+
When enabled, files are saved with a `.gz` suffix (e.g., `results.jsonl.gz`).
|
|
1095
1113
|
|
|
1096
1114
|
Returns
|
|
1097
1115
|
-------
|
|
@@ -1107,6 +1125,7 @@ class Ingestor:
|
|
|
1107
1125
|
self._output_config = {
|
|
1108
1126
|
"output_directory": output_directory,
|
|
1109
1127
|
"cleanup": cleanup,
|
|
1128
|
+
"compression": compression,
|
|
1110
1129
|
}
|
|
1111
1130
|
ensure_directory_with_permissions(output_directory)
|
|
1112
1131
|
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import gzip
|
|
1
2
|
import io
|
|
2
3
|
import json
|
|
3
4
|
import logging
|
|
@@ -6,6 +7,7 @@ import re
|
|
|
6
7
|
from typing import Any
|
|
7
8
|
from typing import Dict
|
|
8
9
|
from typing import List
|
|
10
|
+
from typing import Optional
|
|
9
11
|
from typing import Tuple
|
|
10
12
|
|
|
11
13
|
try:
|
|
@@ -33,6 +35,7 @@ def save_document_results_to_jsonl(
|
|
|
33
35
|
jsonl_output_filepath: str,
|
|
34
36
|
original_source_name_for_log: str,
|
|
35
37
|
ensure_parent_dir_exists: bool = True,
|
|
38
|
+
compression: Optional[str] = None,
|
|
36
39
|
) -> Tuple[int, Dict[str, str]]:
|
|
37
40
|
"""
|
|
38
41
|
Saves a list of extraction items (for a single source document) to a JSON Lines file.
|
|
@@ -50,6 +53,13 @@ def save_document_results_to_jsonl(
|
|
|
50
53
|
if parent_dir:
|
|
51
54
|
os.makedirs(parent_dir, exist_ok=True)
|
|
52
55
|
|
|
56
|
+
if compression == "gzip":
|
|
57
|
+
open_func = gzip.open
|
|
58
|
+
elif compression is None:
|
|
59
|
+
open_func = open
|
|
60
|
+
else:
|
|
61
|
+
raise ValueError(f"Unsupported compression type: {compression}")
|
|
62
|
+
|
|
53
63
|
with io.BytesIO() as buffer:
|
|
54
64
|
for extraction_item in doc_response_data:
|
|
55
65
|
if USING_ORJSON:
|
|
@@ -60,7 +70,7 @@ def save_document_results_to_jsonl(
|
|
|
60
70
|
|
|
61
71
|
count_items_written = len(doc_response_data)
|
|
62
72
|
|
|
63
|
-
with
|
|
73
|
+
with open_func(jsonl_output_filepath, "wb") as f_jsonl:
|
|
64
74
|
f_jsonl.write(full_byte_content)
|
|
65
75
|
|
|
66
76
|
logger.info(
|
|
File without changes
|
{nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/MANIFEST.in
RENAMED
|
File without changes
|
|
File without changes
|
{nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/pyproject.toml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.1.dev20251001}/src/version.py
RENAMED
|
File without changes
|