nv-ingest-client 2025.8.11.dev20250811__tar.gz → 2025.8.13.dev20250813__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (61) hide show
  1. {nv_ingest_client-2025.8.11.dev20250811/src/nv_ingest_client.egg-info → nv_ingest_client-2025.8.13.dev20250813}/PKG-INFO +1 -1
  2. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/vdb/milvus.py +46 -12
  3. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
  4. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/LICENSE +0 -0
  5. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/MANIFEST.in +0 -0
  6. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/README.md +0 -0
  7. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/pyproject.toml +0 -0
  8. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/setup.cfg +0 -0
  9. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/__init__.py +0 -0
  10. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/cli/__init__.py +0 -0
  11. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  12. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/cli/util/click.py +0 -0
  13. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/cli/util/processing.py +0 -0
  14. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/cli/util/system.py +0 -0
  15. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/cli/util/tasks.py +0 -0
  16. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/client/__init__.py +0 -0
  17. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/client/client.py +0 -0
  18. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/client/interface.py +0 -0
  19. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/client/util/processing.py +0 -0
  20. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
  21. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/__init__.py +0 -0
  22. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/exceptions.py +0 -0
  23. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  24. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/jobs/job_spec.py +0 -0
  25. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  26. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  27. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  28. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
  29. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  30. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  31. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/embed.py +0 -0
  32. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
  33. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/filter.py +0 -0
  34. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  35. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  36. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
  37. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  38. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  39. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
  40. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/transform.py +0 -0
  41. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  42. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/__init__.py +0 -0
  43. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/dataset.py +0 -0
  44. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  45. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/file_processing/extract.py +0 -0
  46. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/milvus.py +0 -0
  47. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/process_json_files.py +0 -0
  48. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/processing.py +0 -0
  49. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/system.py +0 -0
  50. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/transport.py +0 -0
  51. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/util.py +0 -0
  52. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  53. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
  54. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  55. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client/util/zipkin.py +0 -0
  56. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client.egg-info/SOURCES.txt +0 -0
  57. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  58. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  59. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client.egg-info/requires.txt +0 -0
  60. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  61. {nv_ingest_client-2025.8.11.dev20250811 → nv_ingest_client-2025.8.13.dev20250813}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.8.11.dev20250811
3
+ Version: 2025.8.13.dev20250813
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -1,23 +1,28 @@
1
+ import ast
2
+ import copy
1
3
  import datetime
4
+ import json
2
5
  import logging
6
+ import os
3
7
  import time
8
+ from functools import partial
9
+ from pathlib import Path
4
10
  from typing import Dict
5
11
  from typing import List
6
12
  from typing import Tuple
7
13
  from typing import Union
8
14
  from urllib.parse import urlparse
9
- from pathlib import Path
10
- import pandas as pd
11
- from functools import partial
12
- import json
13
- import os
14
- import numpy as np
15
- import ast
16
- import copy
17
15
 
16
+ import numpy as np
17
+ import pandas as pd
18
18
  import requests
19
+ from minio import Minio
20
+ from minio.commonconfig import CopySource
21
+ from minio.deleteobjects import DeleteObject
19
22
  from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
23
+ from nv_ingest_client.util.transport import infer_microservice
20
24
  from nv_ingest_client.util.util import ClientConfigSchema
25
+ from nv_ingest_client.util.vdb.adt_vdb import VDB
21
26
  from pymilvus import AnnSearchRequest
22
27
  from pymilvus import BulkInsertState
23
28
  from pymilvus import Collection
@@ -36,13 +41,11 @@ from pymilvus.model.sparse import BM25EmbeddingFunction
36
41
  from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
37
42
  from pymilvus.orm.types import CONSISTENCY_BOUNDED
38
43
  from scipy.sparse import csr_array
39
- from nv_ingest_client.util.transport import infer_microservice
40
- from nv_ingest_client.util.vdb.adt_vdb import VDB
41
-
42
44
 
43
45
  logger = logging.getLogger(__name__)
44
46
 
45
47
  CONSISTENCY = CONSISTENCY_BOUNDED
48
+ MINIO_DEFAULT_BUCKET_NAME = "a-bucket"
46
49
 
47
50
  pandas_reader_map = {
48
51
  ".json": pd.read_json,
@@ -737,6 +740,10 @@ def bulk_insert_milvus(
737
740
  collection_name: str,
738
741
  writer: RemoteBulkWriter,
739
742
  milvus_uri: str = "http://localhost:19530",
743
+ minio_endpoint: str = "localhost:9000",
744
+ access_key: str = "minioadmin",
745
+ secret_key: str = "minioadmin",
746
+ bucket_name: str = "nv-ingest",
740
747
  ):
741
748
  """
742
749
  This function initializes the bulk ingest of all minio uploaded records, and checks for
@@ -754,11 +761,22 @@ def bulk_insert_milvus(
754
761
  Milvus address with http(s) prefix and port. Can also be a file path, to activate
755
762
  milvus-lite.
756
763
  """
764
+ minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
757
765
 
758
766
  connections.connect(uri=milvus_uri)
759
767
  t_bulk_start = time.time()
760
768
  task_ids = []
769
+ uploaded_files = []
761
770
  for files in writer.batch_files:
771
+ for f in files:
772
+ # Hack: do_bulk_insert only reads from the default bucket ('a-bucket'),
773
+ # so we first copy objects from the source bucket into 'a-bucket' before inserting.
774
+ try:
775
+ minio_client.copy_object(MINIO_DEFAULT_BUCKET_NAME, f, CopySource(bucket_name, f))
776
+ uploaded_files.append(f)
777
+ except Exception as e:
778
+ logger.error(f"Error copying {f} from {bucket_name} to {MINIO_DEFAULT_BUCKET_NAME}: {e}")
779
+
762
780
  task_id = utility.do_bulk_insert(
763
781
  collection_name=collection_name,
764
782
  files=files,
@@ -780,6 +798,10 @@ def bulk_insert_milvus(
780
798
  logger.error(f"Task: {task_id}")
781
799
  logger.error(f"Failed reason: {task.failed_reason}")
782
800
  task_ids.remove(task_id)
801
+
802
+ # Cleanup: remove the files copied before bulk insert to undo the temporary workaround.
803
+ minio_client.remove_objects(MINIO_DEFAULT_BUCKET_NAME, [DeleteObject(f) for f in uploaded_files])
804
+
783
805
  t_bulk_end = time.time()
784
806
  logger.info(f"Bulk {collection_name} upload took {t_bulk_end - t_bulk_start} s")
785
807
 
@@ -986,6 +1008,10 @@ def write_to_nvingest_collection(
986
1008
  collection_name,
987
1009
  )
988
1010
  else:
1011
+ minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)
1012
+ if not minio_client.bucket_exists(bucket_name):
1013
+ minio_client.make_bucket(bucket_name)
1014
+
989
1015
  # Connections parameters to access the remote bucket
990
1016
  conn = RemoteBulkWriter.S3ConnectParam(
991
1017
  endpoint=minio_endpoint, # the default MinIO service started along with Milvus
@@ -1004,7 +1030,15 @@ def write_to_nvingest_collection(
1004
1030
  cleaned_records,
1005
1031
  text_writer,
1006
1032
  )
1007
- bulk_insert_milvus(collection_name, writer, milvus_uri)
1033
+ bulk_insert_milvus(
1034
+ collection_name,
1035
+ writer,
1036
+ milvus_uri,
1037
+ minio_endpoint,
1038
+ access_key,
1039
+ secret_key,
1040
+ bucket_name,
1041
+ )
1008
1042
  # fixes bulk insert lag time https://github.com/milvus-io/milvus/issues/21746
1009
1043
  client.refresh_load(collection_name)
1010
1044
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.8.11.dev20250811
3
+ Version: 2025.8.13.dev20250813
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License