nv-ingest-client 2025.10.7.dev20251007__py3-none-any.whl → 2025.10.8.dev20251008__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic.

@@ -0,0 +1,384 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ import time
+ import os
+ import io
+ import base64
+ from collections import defaultdict
+ from typing import Any, Dict, List, Tuple
+
+ from tqdm import tqdm
+
+ # Reuse existing CLI utilities to avoid duplicating behavior
+ from concurrent.futures import as_completed
+ from nv_ingest_client.util.util import check_ingest_result
+ from PIL import Image
+
+ logger = logging.getLogger(__name__)
+
+
+ class IngestJobHandler:
+     """
+     A modular job handler that mirrors the CLI's create_and_process_jobs flow,
+     so the same proven scheduling/retry behavior can be reused by other entry points.
+
+     Usage:
+         handler = IngestJobHandler(client, files, tasks, output_dir, batch_size)
+         total_files, trace_times, total_pages, trace_ids = handler.run()
+     """
+
+     def __init__(
+         self,
+         client: Any,
+         files: List[str],
+         tasks: Dict[str, Any],
+         output_directory: str,
+         batch_size: int,
+         fail_on_error: bool = False,
+         save_images_separately: bool = False,
+         show_progress: bool = True,
+         show_telemetry: bool = False,
+         job_queue_id: str = "ingest_task_queue",
+     ) -> None:
+         self.client = client
+         self.files = files
+         self.tasks = tasks
+         self.output_directory = output_directory
+         self.batch_size = batch_size
+         self.fail_on_error = fail_on_error
+         self.save_images_separately = save_images_separately
+         self.show_progress = show_progress
+         self.show_telemetry = show_telemetry
+         self.job_queue_id = job_queue_id
+         self._pbar = None
+         # Internal state used across iterations
+         self._retry_job_ids: List[str] = []
+         self._processed: int = 0
+         self._job_ids_batch: List[str] = []
+         self._job_id_map: Dict[str, str] = {}
+         self._trace_times: Dict[str, List[float]] = defaultdict(list)
+         # Constants
+         self._IMAGE_TYPES: set = {"png", "bmp", "jpeg", "jpg", "tiff"}
+
+     # ---------------------------
+     # Progress bar helpers
+     # ---------------------------
+     def _init_progress_bar(self, total: int) -> None:
+         if self.show_progress:
+             self._pbar = tqdm(total=total, desc="Processing files", unit="file")
+         else:
+             self._pbar = None
+
+     def _update_progress(self, n: int = 1, pages_per_sec: float | None = None) -> None:
+         if not self._pbar:
+             return
+         if pages_per_sec is not None:
+             self._pbar.set_postfix(pages_per_sec=f"{pages_per_sec:.2f}")
+         self._pbar.update(n)
+
+     def _close_progress_bar(self) -> None:
+         if self._pbar:
+             self._pbar.close()
+             self._pbar = None
+
+     def _generate_job_batch_for_iteration(self) -> None:
+         """
+         Build the next batch of jobs for processing and submit newly created jobs.
+
+         This method mirrors the CLI batching semantics: it prioritizes retry jobs,
+         then creates new jobs up to the given ``batch_size``, submits those new jobs
+         asynchronously to the configured queue, and returns the combined list of
+         job indices for this iteration. It also updates the internal progress bar
+         when configured and advances the processed-file counter.
+
+         Side Effects
+         ------------
+         - Populates/overwrites ``self._job_ids_batch`` with the ordered job indices to
+           process this iteration (``retry`` first, then newly created jobs).
+         - Updates ``self._job_id_map`` with any new mappings from job index to source file path
+           for jobs created in this iteration.
+
+         Raises
+         ------
+         RuntimeError
+             If one or more job specs cannot be created (e.g., unreadable files)
+             and ``self.fail_on_error`` is True.
+
+         Notes
+         -----
+         - Side effects:
+             - Creates JobSpecs via ``self.client.create_jobs_for_batch(...)``.
+             - Submits newly created jobs via ``self.client.submit_job_async(..., self.job_queue_id)``.
+             - Updates the class-owned progress bar (``self._pbar``) to account for
+               missing jobs when some files fail to produce specs and
+               ``self.fail_on_error`` is False.
+         - This method does not perform fetching; it only prepares and submits
+           jobs for the current iteration.
+         - The ``processed`` counter advances by the number of files attempted in
+           this iteration, even if some job specs are missing (unless
+           ``self.fail_on_error`` is True).
+
+         Examples
+         --------
+         >>> handler = IngestJobHandler(client, files, tasks, "/tmp/out", batch_size=32)
+         >>> retry_ids = []
+         >>> handler._generate_job_batch_for_iteration()
+         >>> len(handler._job_ids_batch) <= 32
+         True
+         """
+         job_indices: List[str] = []
+         job_index_map_updates: Dict[str, str] = {}
+         cur_job_count: int = 0
+
+         if self._retry_job_ids:
+             job_indices.extend(self._retry_job_ids)
+             cur_job_count = len(job_indices)
+
+         if (cur_job_count < self.batch_size) and (self._processed < len(self.files)):
+             new_job_count: int = min(self.batch_size - cur_job_count, len(self.files) - self._processed)
+             batch_files: List[str] = self.files[self._processed : self._processed + new_job_count]
+
+             new_job_indices: List[str] = self.client.create_jobs_for_batch(batch_files, self.tasks)
+             if len(new_job_indices) != new_job_count:
+                 missing_jobs: int = new_job_count - len(new_job_indices)
+                 error_msg: str = (
+                     f"Missing {missing_jobs} job specs -- this is likely due to bad reads or file corruption"
+                 )
+                 logger.warning(error_msg)
+
+                 if self.fail_on_error:
+                     raise RuntimeError(error_msg)
+
+                 if self._pbar:
+                     self._pbar.update(missing_jobs)
+
+             job_index_map_updates = {job_index: file for job_index, file in zip(new_job_indices, batch_files)}
+             self._processed += new_job_count
+             # Submit newly created jobs asynchronously to the configured queue
+             _ = self.client.submit_job_async(new_job_indices, self.job_queue_id)
+             job_indices.extend(new_job_indices)
+
+         # Save into class state
+         self._job_ids_batch = job_indices
+         # Merge new mappings (do not drop existing entries for retry jobs)
+         self._job_id_map.update(job_index_map_updates)
+
+     def _handle_future_result(self, future, timeout: int = 10):
+         """
+         Handle the result of a completed future job and process annotations.
+
+         Parameters
+         ----------
+         future : concurrent.futures.Future
+             Future representing an asynchronous job.
+         timeout : int, optional
+             Maximum seconds to wait for the future result.
+
+         Returns
+         -------
+         Tuple[Dict[str, Any], str]
+             The decoded result dictionary and the trace_id for the job.
+
+         Raises
+         ------
+         RuntimeError
+             If the job result indicates failure per check_ingest_result.
+         """
+         result, _, trace_id = future.result(timeout=timeout)[0]
+         if ("annotations" in result) and result["annotations"]:
+             annotations = result["annotations"]
+             for key, value in annotations.items():
+                 logger.debug(f"Annotation: {key} -> {json.dumps(value, indent=2)}")
+
+         failed, description = check_ingest_result(result)
+         if failed:
+             raise RuntimeError(f"Ingest job failed: {description}")
+
+         return result, trace_id
+
+     def _process_response(self, response: Dict[str, Any]) -> None:
+         """
+         Extract trace timing entries from a response and accumulate per-stage elapsed times
+         into ``self._trace_times``.
+
+         Parameters
+         ----------
+         response : Dict[str, Any]
+             Full response payload containing an optional ``trace`` dictionary with
+             entry/exit timestamps.
+         """
+         trace_data: Dict[str, Any] = response.get("trace", {})
+         for key, entry_time in trace_data.items():
+             if "entry" in key:
+                 exit_key: str = key.replace("entry", "exit")
+                 exit_time: Any = trace_data.get(exit_key)
+                 if exit_time:
+                     stage_parts = key.split("::")
+                     if len(stage_parts) >= 3:
+                         stage_name: str = stage_parts[2]
+                         elapsed_time: int = exit_time - entry_time
+                         self._trace_times[stage_name].append(elapsed_time)
+
+     def _save_response_data(
+         self, response: Dict[str, Any], output_directory: str, images_to_disk: bool = False
+     ) -> None:
+         """
+         Save the response data into categorized metadata JSON files and optionally save images to disk.
+
+         Parameters
+         ----------
+         response : Dict[str, Any]
+             Full response payload with a "data" list of documents.
+         output_directory : str
+             Output directory where per-type metadata JSON files (and any media) are written.
+         images_to_disk : bool, optional
+             If True, decode and write image contents to disk and replace content with a file URL.
+         """
+         if ("data" not in response) or (not response["data"]):
+             logger.debug("Data is not in the response or response.data is empty")
+             return
+
+         response_data = response["data"]
+         if not isinstance(response_data, list) or len(response_data) == 0:
+             logger.debug("Response data is not a list or the list is empty.")
+             return
+
+         doc_meta_base = response_data[0]["metadata"]
+         source_meta = doc_meta_base["source_metadata"]
+         doc_name = source_meta["source_id"]
+         clean_doc_name = os.path.basename(doc_name)
+         output_name = f"{clean_doc_name}.metadata.json"
+
+         # Organize by document type
+         doc_map: Dict[str, List[Dict[str, Any]]] = {}
+         for document in response_data:
+             meta: Dict[str, Any] = document.get("metadata", {})
+             content_meta: Dict[str, Any] = meta.get("content_metadata", {})
+             doc_type: str = content_meta.get("type", "unknown")
+             doc_map.setdefault(doc_type, []).append(document)
+
+         for doc_type, documents in doc_map.items():
+             doc_type_path = os.path.join(output_directory, doc_type)
+             os.makedirs(doc_type_path, exist_ok=True)
+
+             if doc_type in ("image", "structured") and images_to_disk:
+                 for i, doc in enumerate(documents):
+                     meta: Dict[str, Any] = doc.get("metadata", {})
+                     image_content = meta.get("content")
+                     image_type = (
+                         meta.get("image_metadata", {}).get("image_type", "png").lower()
+                         if doc_type == "image"
+                         else "png"
+                     )
+
+                     if image_content and image_type in self._IMAGE_TYPES:
+                         try:
+                             image_data = base64.b64decode(image_content)
+                             image = Image.open(io.BytesIO(image_data))
+
+                             image_ext = "jpg" if image_type == "jpeg" else image_type
+                             image_filename = f"{clean_doc_name}_{i}.{image_ext}"
+                             image_output_path = os.path.join(doc_type_path, "media", image_filename)
+                             os.makedirs(os.path.dirname(image_output_path), exist_ok=True)
+                             image.save(image_output_path, format=image_ext.upper())
+
+                             meta["content"] = ""
+                             meta["content_url"] = os.path.realpath(image_output_path)
+                             logger.debug(f"Saved image to {image_output_path}")
+                         except Exception as e:
+                             logger.error(f"Failed to save image {i} for {clean_doc_name}: {e}")
+
+             # Write the metadata JSON file for this type
+             with open(os.path.join(doc_type_path, output_name), "w") as f:
+                 f.write(json.dumps(documents, indent=2))
+
+     def run(self) -> Tuple[int, Dict[str, List[float]], int, Dict[str, str]]:
+         total_files: int = len(self.files)
+         total_pages_processed: int = 0
+         trace_ids: Dict[str, str] = defaultdict(list)  # type: ignore
+         failed_jobs: List[str] = []
+         retry_counts: Dict[str, int] = defaultdict(int)
+
+         start_time_ns: int = time.time_ns()
+         self._init_progress_bar(total_files)
+         try:
+             self._processed = 0
+             while (self._processed < len(self.files)) or self._retry_job_ids:
+                 # Create a batch (retries first, then new jobs up to batch_size)
+                 self._generate_job_batch_for_iteration()
+                 job_id_map = self._job_id_map
+                 self._retry_job_ids = []
+
+                 futures_dict: Dict[Any, str] = self.client.fetch_job_result_async(self._job_ids_batch, data_only=False)
+                 for future in as_completed(futures_dict.keys()):
+                     try:
+                         # Block as each future completes; this mirrors CLI behavior
+                         future_response, trace_id = self._handle_future_result(future)
+                         job_id: str = futures_dict[future]
+                         trace_ids[job_id_map[job_id]] = trace_id
+
+                         first_page_metadata = future_response["data"][0]["metadata"]
+                         file_page_counts: Dict[str, int] = {
+                             first_page_metadata["source_metadata"]["source_name"]: first_page_metadata[
+                                 "content_metadata"
+                             ]["hierarchy"]["page_count"]
+                         }
+
+                         if self.output_directory:
+                             self._save_response_data(
+                                 future_response,
+                                 self.output_directory,
+                                 images_to_disk=self.save_images_separately,
+                             )
+
+                         total_pages_processed += file_page_counts[list(file_page_counts.keys())[0]]
+                         elapsed_time: float = (time.time_ns() - start_time_ns) / 1e9
+                         if elapsed_time > 0:
+                             pages_per_sec: float = total_pages_processed / elapsed_time
+                         else:
+                             pages_per_sec = None
+
+                         self._process_response(future_response)
+
+                     except TimeoutError:
+                         job_id = futures_dict[future]
+                         src_name = job_id_map[job_id]
+                         retry_counts[src_name] += 1
+                         self._retry_job_ids.append(job_id)
+                     except json.JSONDecodeError as e:
+                         job_id = futures_dict[future]
+                         src_name = job_id_map[job_id]
+                         logger.error(f"Decoding error while processing {job_id}({src_name}): {e}")
+                         failed_jobs.append(f"{job_id}::{src_name}")
+                     except RuntimeError as e:
+                         job_id = futures_dict[future]
+                         src_name = job_id_map[job_id]
+                         logger.error(f"Error while processing '{job_id}' - ({src_name}):\n{e}")
+                         failed_jobs.append(f"{job_id}::{src_name}")
+                     except Exception as e:
+                         job_id = futures_dict[future]
+                         src_name = job_id_map[job_id]
+                         logger.exception(f"Unhandled error while processing {job_id}({src_name}): {e}")
+                         failed_jobs.append(f"{job_id}::{src_name}")
+                     finally:
+                         # Do not update the pbar if this job is going to be retried
+                         if futures_dict[future] not in self._retry_job_ids:
+                             self._update_progress(1, pages_per_sec)
+         finally:
+             self._close_progress_bar()
+
+         # Optionally print telemetry summary
+         if self.show_telemetry and hasattr(self.client, "summarize_telemetry"):
+             try:
+                 summary = self.client.summarize_telemetry()
+                 logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+             except Exception:
+                 pass
+
+         return total_files, self._trace_times, total_pages_processed, trace_ids
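
For orientation, here is a minimal sketch of driving the new nv_ingest_client/client/ingest_job_handler.py handler outside the CLI. It uses only the constructor arguments and return values visible in the hunk above; the NvIngestClient construction details, the input paths, and the shape of the tasks mapping are placeholders/assumptions that depend on your deployment.

    from nv_ingest_client.client import NvIngestClient
    from nv_ingest_client.client.ingest_job_handler import IngestJobHandler

    # Assumption: a default-constructed client pointed at a locally running ingest service.
    client = NvIngestClient()

    files = ["/data/report.pdf"]  # hypothetical input paths
    tasks = {}  # assumption: a task mapping shaped like the one the CLI builds from validated --task options

    handler = IngestJobHandler(
        client=client,
        files=files,
        tasks=tasks,
        output_directory="/tmp/out",
        batch_size=32,
        show_progress=True,
        show_telemetry=False,
    )
    total_files, trace_times, total_pages, trace_ids = handler.run()
    print(f"{total_files} files, {total_pages} pages; stages timed: {sorted(trace_times)}")
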
@@ -404,10 +404,11 @@ class Ingestor:
          save_to_disk: bool = False,
          **kwargs: Any,
      ) -> Union[
-         List[List[Dict[str, Any]]],  # In-memory: List of (response['data'] for each doc)
+         List[List[Dict[str, Any]]],  # In-memory: List of response['data'] for each doc
+         List[Dict[str, Any]],  # In-memory: Full response envelopes when return_full_response=True
          List[LazyLoadedList],  # Disk: List of proxies, one per original doc
          Tuple[
-             Union[List[List[Dict[str, Any]]], List[LazyLoadedList]],
+             Union[List[List[Dict[str, Any]]], List[Dict[str, Any]], List[LazyLoadedList]],
              List[Tuple[str, str]],
          ],
      ]:  # noqa: E501
@@ -423,13 +424,16 @@ class Ingestor:
          **kwargs : Any
              Additional keyword arguments for the underlying client methods. Supported keys:
              'concurrency_limit', 'timeout', 'max_job_retries', 'retry_delay',
-             'data_only', 'verbose'. Unrecognized keys are passed through to
-             process_jobs_concurrently.
+             'data_only', 'return_full_response', 'verbose'. Unrecognized keys are passed
+             through to process_jobs_concurrently.

          Returns
          -------
-         results : list of dict
-             List of successful job results when `return_failures` is False.
+         results : list
+             When `return_failures` is False:
+             - Default: List of response['data'] per job (list[list[dict]]).
+             - If `return_full_response=True`: List of full response envelopes (each dict
+               contains keys like 'data', 'trace', 'annotations').
          results, failures : tuple (list of dict, list of tuple of str)
              Tuple containing successful results and failure information when `return_failures` is True.
          """
@@ -549,6 +553,22 @@ class Ingestor:

          proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently, **kwargs)

+         # Telemetry controls (optional)
+         enable_telemetry: Optional[bool] = kwargs.pop("enable_telemetry", None)
+         show_telemetry: Optional[bool] = kwargs.pop("show_telemetry", None)
+         if show_telemetry is None:
+             # Fallback to env NV_INGEST_CLIENT_SHOW_TELEMETRY (0/1), default off
+             try:
+                 show_telemetry = bool(int(os.getenv("NV_INGEST_CLIENT_SHOW_TELEMETRY", "0")))
+             except ValueError:
+                 show_telemetry = False
+         # If user explicitly wants to show telemetry but did not specify enable_telemetry,
+         # ensure collection is enabled so summary isn't empty.
+         if enable_telemetry is None and show_telemetry:
+             enable_telemetry = True
+         if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
+             self._client.enable_telemetry(bool(enable_telemetry))
+
          results, failures = self._client.process_jobs_concurrently(
              job_indices=self._job_ids,
              job_queue_id=self._job_queue_id,
@@ -611,6 +631,16 @@ class Ingestor:
              logger.info("Purging saved results from disk after successful VDB upload.")
              self._purge_saved_results(results)

+         # Print telemetry summary if requested
+         if show_telemetry:
+             try:
+                 summary = self._client.summarize_telemetry()
+                 # Print to stdout and log for convenience
+                 print("NvIngestClient Telemetry Summary:", json.dumps(summary, indent=2))
+                 logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+             except Exception:
+                 pass
+
          return (results, failures) if return_failures else results

      def ingest_async(self, **kwargs: Any) -> Future:
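
Taking the two telemetry hunks together, a sketch of how the opt-in is expected to behave. The keyword arguments and the environment variable name are taken from the hunks above; `ingestor` is the hypothetical instance from the previous sketch, and both paths degrade gracefully because the client calls are guarded with hasattr.

    import os

    # Per-call opt-in: show_telemetry=True implies enable_telemetry=True when the
    # latter is not given, so the printed summary is not empty.
    results = ingestor.ingest(show_telemetry=True)

    # Environment opt-in, read only when show_telemetry is not passed explicitly.
    os.environ["NV_INGEST_CLIENT_SHOW_TELEMETRY"] = "1"
    results = ingestor.ingest()
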
@@ -25,10 +25,10 @@ from nv_ingest_client.cli.util.click import click_match_and_validate_files
  from nv_ingest_client.cli.util.click import click_validate_batch_size
  from nv_ingest_client.cli.util.click import click_validate_file_exists
  from nv_ingest_client.cli.util.click import click_validate_task
- from nv_ingest_client.cli.util.processing import create_and_process_jobs
  from nv_ingest_client.cli.util.processing import report_statistics
  from nv_ingest_client.cli.util.system import configure_logging
  from nv_ingest_client.client import NvIngestClient
+ from nv_ingest_client.client.ingest_job_handler import IngestJobHandler
  from nv_ingest_client.util.dataset import get_dataset_files
  from nv_ingest_client.util.dataset import get_dataset_statistics
  from nv_ingest_client.util.system import ensure_directory_with_permissions
@@ -290,15 +290,18 @@ def main(
      )

      start_time_ns = time.time_ns()
-     (total_files, trace_times, pages_processed, trace_ids) = create_and_process_jobs(
-         files=docs,
+     handler = IngestJobHandler(
          client=ingest_client,
+         files=docs,
          tasks=task,
          output_directory=output_directory,
          batch_size=batch_size,
          fail_on_error=fail_on_error,
          save_images_separately=save_images_separately,
+         show_progress=True,
+         show_telemetry=True,
      )
+     (total_files, trace_times, pages_processed, trace_ids) = handler.run()

      report_statistics(start_time_ns, trace_times, pages_processed, total_files)

@@ -86,7 +86,7 @@ class ExtractTask(Task):
          extract_page_as_image: bool = False,
          text_depth: str = "document",
          paddle_output_format: str = "pseudo_markdown",
-         table_output_format: str = "pseudo_markdown",
+         table_output_format: str = "markdown",
      ) -> None:
          """
          Setup Extract Task Config
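
The default table output format of ExtractTask changes from "pseudo_markdown" to "markdown". A small sketch of pinning the old behavior explicitly, assuming the remaining constructor arguments keep their defaults:

    from nv_ingest_client.primitives.tasks.extract import ExtractTask

    # New default in this release: tables are returned as markdown.
    task_new_default = ExtractTask()

    # Pin the previous format if downstream consumers still expect pseudo_markdown.
    task_legacy = ExtractTask(table_output_format="pseudo_markdown")
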
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nv-ingest-client
- Version: 2025.10.7.dev20251007
+ Version: 2025.10.8.dev20251008
  Summary: Python client for the nv-ingest service
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
  License: Apache License
@@ -1,13 +1,14 @@
  nv_ingest_client/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
- nv_ingest_client/nv_ingest_cli.py,sha256=GG7x_fe423NHQRDmpNcTtNI2P_g1xgg9SQ5JjbdBAIU,13592
+ nv_ingest_client/nv_ingest_cli.py,sha256=8HpbU6l0i19M9kjZdZKbf03z-CZIZoikigZuo9wD77g,13693
  nv_ingest_client/cli/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_client/cli/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
  nv_ingest_client/cli/util/click.py,sha256=YjQU1uF148FU5D3ozC2m1kkfOOJxO1U8U552-T8PjU4,20029
- nv_ingest_client/cli/util/processing.py,sha256=7mXPjjNjLzWQY7WSxpm6et6ZEZOj0GYhLqvz-jx6MO4,24002
+ nv_ingest_client/cli/util/processing.py,sha256=ULGCYQF1RTDQV_b35YM1WQRqIjR2wQRMJWu41DogagE,6259
  nv_ingest_client/cli/util/system.py,sha256=AQLq0DD2Ns8jRanrKu1tmVBKPA9rl-F3-ZsGI6FXLqE,1105
  nv_ingest_client/client/__init__.py,sha256=eEX9l1qmkLH2lAAZU3eP17SCV06ZjjrshHAB_xbboHA,375
- nv_ingest_client/client/client.py,sha256=wgPeLUByBNcQRkl1FXe7neHNNC5eY2sVve99g5sW41k,65068
- nv_ingest_client/client/interface.py,sha256=LeRtd92y0ab1K1f1nOB6_HpVWHJ_pPcIoLcQJGbjCSs,47218
+ nv_ingest_client/client/client.py,sha256=egPegAoe8sVYxWWbNl1V5xJdaABJxgVxX7AHRoa049w,76058
+ nv_ingest_client/client/ingest_job_handler.py,sha256=i2PC5AUaglN5aGAqE4Nrbk95TyLYN7P_Nwba0C_19xA,16810
+ nv_ingest_client/client/interface.py,sha256=c1SRgS_ryz-DAYRPr8OIW0HpWzUx-GsnlfEK2_K7fZg,48897
  nv_ingest_client/client/util/processing.py,sha256=Ky7x7QbLn3BlgYwmrmoIc-o1VwmlmrcP9tn7GVTi0t0,2502
  nv_ingest_client/primitives/__init__.py,sha256=3rbpLCI7Bl0pntGatAxXD_V01y6dcLhHFheI3wqet-I,269
  nv_ingest_client/primitives/jobs/__init__.py,sha256=-yohgHv3LcCtSleHSaxjv1oO7nNcMCjN3ZYoOkIypIk,469
@@ -19,7 +20,7 @@ nv_ingest_client/primitives/tasks/caption.py,sha256=I1nOpfGb1Ts7QsElwfayhw-F_UcY
  nv_ingest_client/primitives/tasks/chart_extraction.py,sha256=s5hsljgSXxQMZHGekpAg6OYJ9k3-DHk5NmFpvtKJ6Zs,1493
  nv_ingest_client/primitives/tasks/dedup.py,sha256=qort6p3t6ZJuK_74sfOOLp3vMT3hkB5DAu3467WenyY,1719
  nv_ingest_client/primitives/tasks/embed.py,sha256=I6Irmvm1Qj9oqzDGSgfykCtfz8pz9LNxiXO-t29nXv8,5916
- nv_ingest_client/primitives/tasks/extract.py,sha256=yJEMGIiquhPlIofE6ERbM-U5tXk-GjZvvnnWOnU7YOA,9335
+ nv_ingest_client/primitives/tasks/extract.py,sha256=bRriVkQyXN-UwzprHIt4Lp0iwmAojLEXqBb-IUrf3vY,9328
  nv_ingest_client/primitives/tasks/filter.py,sha256=wjcfSBGhdEyPh2tf42NMcyKZziigm24CO9B4obpQytU,2618
  nv_ingest_client/primitives/tasks/infographic_extraction.py,sha256=SyTjZQbdVA3QwM5yVm4fUzE4Gu4zm4tAfNLDZMvySV8,1537
  nv_ingest_client/primitives/tasks/split.py,sha256=8UkB3EialsOTEbsOZLxzmnDIfTJzC6uvjNv21IbgAVA,2332
@@ -46,9 +47,9 @@ nv_ingest_client/util/vdb/__init__.py,sha256=ZmoEzeM9LzwwrVvu_DVUnjRNx-x8ahkNeIr
  nv_ingest_client/util/vdb/adt_vdb.py,sha256=UubzAMSfyrqqpD-OQErpBs25hC2Mw8zGZ4waenGXPOk,515
  nv_ingest_client/util/vdb/milvus.py,sha256=dYXszrWdwYYASBW6t8lMI6QK9-BzhV6HAUYjt3cIDsE,78602
  nv_ingest_client/util/vdb/opensearch.py,sha256=I4FzF95VWCOkyzhfm-szdfK1Zd9ugUc8AxxpAdEMWGE,7538
- nv_ingest_client-2025.10.7.dev20251007.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- nv_ingest_client-2025.10.7.dev20251007.dist-info/METADATA,sha256=405VU6NsNx7hRRd_QqvRZwNEuX1KtYg6nPFIPs9DL_Y,30626
- nv_ingest_client-2025.10.7.dev20251007.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- nv_ingest_client-2025.10.7.dev20251007.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
- nv_ingest_client-2025.10.7.dev20251007.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
- nv_ingest_client-2025.10.7.dev20251007.dist-info/RECORD,,
+ nv_ingest_client-2025.10.8.dev20251008.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ nv_ingest_client-2025.10.8.dev20251008.dist-info/METADATA,sha256=iWTJhO2KwEvrksJGTHektUTFkhPh5zjgxoTI_9N1lyo,30626
+ nv_ingest_client-2025.10.8.dev20251008.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ nv_ingest_client-2025.10.8.dev20251008.dist-info/entry_points.txt,sha256=3uQVZkTZIjO08_bjTV-g0CDF5H1nrP1zWXU9gJOweuI,137
+ nv_ingest_client-2025.10.8.dev20251008.dist-info/top_level.txt,sha256=1eMhBFD3SiWmpXnod2LM66C1HrSLSk96ninZi5XX-cE,17
+ nv_ingest_client-2025.10.8.dev20251008.dist-info/RECORD,,