nv-ingest-client 2025.9.30.dev20250930__tar.gz → 2025.10.2.dev20251002__tar.gz

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (61)
  1. {nv_ingest_client-2025.9.30.dev20250930/src/nv_ingest_client.egg-info → nv_ingest_client-2025.10.2.dev20251002}/PKG-INFO +1 -1
  2. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/client/interface.py +24 -5
  3. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/client/util/processing.py +11 -1
  4. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
  5. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/LICENSE +0 -0
  6. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/MANIFEST.in +0 -0
  7. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/README.md +0 -0
  8. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/pyproject.toml +0 -0
  9. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/setup.cfg +0 -0
  10. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/__init__.py +0 -0
  11. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/cli/__init__.py +0 -0
  12. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  13. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/cli/util/click.py +0 -0
  14. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/cli/util/processing.py +0 -0
  15. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/cli/util/system.py +0 -0
  16. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/client/__init__.py +0 -0
  17. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/client/client.py +0 -0
  18. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
  19. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/__init__.py +0 -0
  20. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  21. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/jobs/job_spec.py +0 -0
  22. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  23. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  24. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  25. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
  26. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  27. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  28. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
  29. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
  30. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
  31. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  32. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  33. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
  34. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  35. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  36. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
  37. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/udf.py +0 -0
  38. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  39. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/__init__.py +0 -0
  40. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/dataset.py +0 -0
  41. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/document_analysis.py +0 -0
  42. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  43. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
  44. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
  45. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/milvus.py +0 -0
  46. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/process_json_files.py +0 -0
  47. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/processing.py +0 -0
  48. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/system.py +0 -0
  49. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/transport.py +0 -0
  50. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/util.py +0 -0
  51. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  52. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
  53. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/vdb/milvus.py +0 -0
  54. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  55. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client/util/zipkin.py +0 -0
  56. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client.egg-info/SOURCES.txt +0 -0
  57. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  58. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  59. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client.egg-info/requires.txt +0 -0
  60. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  61. {nv_ingest_client-2025.9.30.dev20250930 → nv_ingest_client-2025.10.2.dev20251002}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.9.30.dev20250930
3
+ Version: 2025.10.2.dev20251002
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -6,6 +6,7 @@
6
6
 
7
7
  import collections
8
8
  import glob
9
+ import gzip
9
10
  import json
10
11
  import logging
11
12
  import os
@@ -93,17 +94,20 @@ def ensure_job_specs(func):
93
94
 
94
95
 
95
96
  class LazyLoadedList(collections.abc.Sequence):
96
- def __init__(self, filepath: str, expected_len: Optional[int] = None):
97
+ def __init__(self, filepath: str, expected_len: Optional[int] = None, compression: Optional[str] = None):
97
98
  self.filepath = filepath
98
99
  self._len: Optional[int] = expected_len # Store pre-calculated length
99
100
  self._offsets: Optional[List[int]] = None
101
+ self.compression = compression
100
102
 
101
103
  if self._len == 0:
102
104
  self._offsets = []
103
105
 
106
+ self._open = gzip.open if self.compression == "gzip" else open
107
+
104
108
  def __iter__(self) -> Iterator[Any]:
105
109
  try:
106
- with open(self.filepath, "r", encoding="utf-8") as f:
110
+ with self._open(self.filepath, "rt", encoding="utf-8") as f:
107
111
  for line in f:
108
112
  yield json.loads(line)
109
113
  except FileNotFoundError:
@@ -120,7 +124,7 @@ class LazyLoadedList(collections.abc.Sequence):
120
124
  self._offsets = []
121
125
  line_count = 0
122
126
  try:
123
- with open(self.filepath, "rb") as f:
127
+ with self._open(self.filepath, "rb") as f:
124
128
  while True:
125
129
  current_pos = f.tell()
126
130
  line = f.readline()
@@ -144,10 +148,12 @@ class LazyLoadedList(collections.abc.Sequence):
144
148
  def __len__(self) -> int:
145
149
  if self._len is not None:
146
150
  return self._len
151
+
147
152
  if self._offsets is not None:
148
153
  self._len = len(self._offsets)
149
154
  return self._len
150
155
  self._build_index()
156
+
151
157
  return self._len if self._len is not None else 0
152
158
 
153
159
  def __getitem__(self, idx: int) -> Any:
@@ -170,7 +176,7 @@ class LazyLoadedList(collections.abc.Sequence):
170
176
  raise IndexError(f"Index {idx} out of range for {self.filepath} (len: {len(self._offsets)})")
171
177
 
172
178
  try:
173
- with open(self.filepath, "rb") as f:
179
+ with self._open(self.filepath, "rb") as f:
174
180
  f.seek(self._offsets[idx])
175
181
  line_bytes = f.readline()
176
182
  return json.loads(line_bytes.decode("utf-8"))
@@ -455,6 +461,8 @@ class Ingestor:
455
461
  clean_source_basename = get_valid_filename(os.path.basename(source_name))
456
462
  file_name, file_ext = os.path.splitext(clean_source_basename)
457
463
  file_suffix = f".{file_ext.strip('.')}.results.jsonl"
464
+ if self._output_config["compression"] == "gzip":
465
+ file_suffix += ".gz"
458
466
  jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
459
467
 
460
468
  num_items_saved = save_document_results_to_jsonl(
@@ -462,10 +470,13 @@ class Ingestor:
462
470
  jsonl_filepath,
463
471
  source_name,
464
472
  ensure_parent_dir_exists=False,
473
+ compression=self._output_config["compression"],
465
474
  )
466
475
 
467
476
  if num_items_saved > 0:
468
- results = LazyLoadedList(jsonl_filepath, expected_len=num_items_saved)
477
+ results = LazyLoadedList(
478
+ jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
479
+ )
469
480
  if results_lock:
470
481
  with results_lock:
471
482
  final_results_payload_list.append(results)
@@ -1068,6 +1079,7 @@ class Ingestor:
1068
1079
  self,
1069
1080
  output_directory: Optional[str] = None,
1070
1081
  cleanup: bool = True,
1082
+ compression: Optional[str] = "gzip",
1071
1083
  ) -> "Ingestor":
1072
1084
  """Configures the Ingestor to save results to disk instead of memory.
1073
1085
 
@@ -1092,6 +1104,12 @@ class Ingestor:
1092
1104
  when the Ingestor's context is exited (i.e., when used in a `with`
1093
1105
  statement).
1094
1106
  Defaults to True.
1107
+ compression : str, optional
1108
+ The compression algorithm to use for the saved result files.
1109
+ Currently, the only supported value is `'gzip'`. To disable
1110
+ compression, set this parameter to `None`. Defaults to `'gzip'`,
1111
+ which significantly reduces the disk space required for results.
1112
+ When enabled, files are saved with a `.gz` suffix (e.g., `results.jsonl.gz`).
1095
1113
 
1096
1114
  Returns
1097
1115
  -------
@@ -1107,6 +1125,7 @@ class Ingestor:
1107
1125
  self._output_config = {
1108
1126
  "output_directory": output_directory,
1109
1127
  "cleanup": cleanup,
1128
+ "compression": compression,
1110
1129
  }
1111
1130
  ensure_directory_with_permissions(output_directory)
1112
1131
 
@@ -1,3 +1,4 @@
1
+ import gzip
1
2
  import io
2
3
  import json
3
4
  import logging
@@ -6,6 +7,7 @@ import re
6
7
  from typing import Any
7
8
  from typing import Dict
8
9
  from typing import List
10
+ from typing import Optional
9
11
  from typing import Tuple
10
12
 
11
13
  try:
@@ -33,6 +35,7 @@ def save_document_results_to_jsonl(
33
35
  jsonl_output_filepath: str,
34
36
  original_source_name_for_log: str,
35
37
  ensure_parent_dir_exists: bool = True,
38
+ compression: Optional[str] = None,
36
39
  ) -> Tuple[int, Dict[str, str]]:
37
40
  """
38
41
  Saves a list of extraction items (for a single source document) to a JSON Lines file.
@@ -50,6 +53,13 @@ def save_document_results_to_jsonl(
50
53
  if parent_dir:
51
54
  os.makedirs(parent_dir, exist_ok=True)
52
55
 
56
+ if compression == "gzip":
57
+ open_func = gzip.open
58
+ elif compression is None:
59
+ open_func = open
60
+ else:
61
+ raise ValueError(f"Unsupported compression type: {compression}")
62
+
53
63
  with io.BytesIO() as buffer:
54
64
  for extraction_item in doc_response_data:
55
65
  if USING_ORJSON:
@@ -60,7 +70,7 @@ def save_document_results_to_jsonl(
60
70
 
61
71
  count_items_written = len(doc_response_data)
62
72
 
63
- with open(jsonl_output_filepath, "wb") as f_jsonl:
73
+ with open_func(jsonl_output_filepath, "wb") as f_jsonl:
64
74
  f_jsonl.write(full_byte_content)
65
75
 
66
76
  logger.info(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.9.30.dev20250930
3
+ Version: 2025.10.2.dev20251002
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License