nv-ingest-api 2025.5.13.dev20250513__py3-none-any.whl → 2025.5.15.dev20250515__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (26)
  1. nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
  2. nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
  3. nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
  4. nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
  5. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +1 -1
  6. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
  7. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +5 -8
  8. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +1 -1
  9. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +1 -1
  10. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +1 -1
  11. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +26 -12
  12. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +34 -23
  13. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +11 -10
  14. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +9 -7
  15. nv_ingest_api/internal/store/image_upload.py +1 -0
  16. nv_ingest_api/util/__init__.py +3 -0
  17. nv_ingest_api/util/image_processing/processing.py +1 -1
  18. nv_ingest_api/util/pdf/pdfium.py +1 -1
  19. nv_ingest_api/util/schema/__init__.py +3 -0
  20. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  21. nv_ingest_api/util/system/hardware_info.py +4 -0
  22. {nv_ingest_api-2025.5.13.dev20250513.dist-info → nv_ingest_api-2025.5.15.dev20250515.dist-info}/METADATA +1 -1
  23. {nv_ingest_api-2025.5.13.dev20250513.dist-info → nv_ingest_api-2025.5.15.dev20250515.dist-info}/RECORD +26 -26
  24. {nv_ingest_api-2025.5.13.dev20250513.dist-info → nv_ingest_api-2025.5.15.dev20250515.dist-info}/WHEEL +1 -1
  25. {nv_ingest_api-2025.5.13.dev20250513.dist-info → nv_ingest_api-2025.5.15.dev20250515.dist-info}/licenses/LICENSE +0 -0
  26. {nv_ingest_api-2025.5.13.dev20250513.dist-info → nv_ingest_api-2025.5.15.dev20250515.dist-info}/top_level.txt +0 -0
@@ -27,7 +27,7 @@ from nv_ingest_api.util.nim import create_inference_client
27
27
  PADDLE_MIN_WIDTH = 32
28
28
  PADDLE_MIN_HEIGHT = 32
29
29
 
30
- logger = logging.getLogger(f"morpheus.{__name__}")
30
+ logger = logging.getLogger(f"ray.{__name__}")
31
31
 
32
32
 
33
33
  def _filter_valid_chart_images(
@@ -80,7 +80,7 @@ def _run_chart_inference(
80
80
  yolox_client.infer,
81
81
  data=data_yolox,
82
82
  model_name="yolox",
83
- stage_name="chart_data_extraction",
83
+ stage_name="chart_extraction",
84
84
  max_batch_size=8,
85
85
  trace_info=trace_info,
86
86
  )
@@ -88,7 +88,7 @@ def _run_chart_inference(
88
88
  paddle_client.infer,
89
89
  data=data_paddle,
90
90
  model_name="paddle",
91
- stage_name="chart_data_extraction",
91
+ stage_name="chart_extraction",
92
92
  max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
93
93
  trace_info=trace_info,
94
94
  )
@@ -223,7 +223,7 @@ def extract_page_elements_from_images(
223
223
  model_name="yolox",
224
224
  max_batch_size=YOLOX_MAX_BATCH_SIZE,
225
225
  trace_info=trace_info,
226
- stage_name="pdf_content_extractor",
226
+ stage_name="pdf_extraction",
227
227
  )
228
228
 
229
229
  # Process each result along with its corresponding image.
@@ -100,7 +100,7 @@ def _update_infographic_metadata(
100
100
  paddle_results = paddle_client.infer(
101
101
  data=data_paddle,
102
102
  model_name="paddle",
103
- stage_name="infographic_data_extraction",
103
+ stage_name="infographic_extraction",
104
104
  max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
105
105
  trace_info=trace_info,
106
106
  )
@@ -81,7 +81,7 @@ def _run_inference(
81
81
  yolox_client.infer,
82
82
  data=data_yolox,
83
83
  model_name="yolox",
84
- stage_name="table_data_extraction",
84
+ stage_name="table_extraction",
85
85
  max_batch_size=8,
86
86
  trace_info=trace_info,
87
87
  )
@@ -89,7 +89,7 @@ def _run_inference(
89
89
  paddle_client.infer,
90
90
  data=data_paddle,
91
91
  model_name="paddle",
92
- stage_name="table_data_extraction",
92
+ stage_name="table_extraction",
93
93
  max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
94
94
  trace_info=trace_info,
95
95
  )
@@ -466,7 +466,7 @@ def _extract_text_and_bounding_boxes(
466
466
  inference_results = nemoretriever_parse_client.infer(
467
467
  data=data,
468
468
  model_name="nemoretriever_parse",
469
- stage_name="pdf_content_extractor",
469
+ stage_name="pdf_extraction",
470
470
  max_batch_size=NEMORETRIEVER_PARSE_MAX_BATCH_SIZE,
471
471
  execution_trace_log=execution_trace_log,
472
472
  )
@@ -105,7 +105,7 @@ def _extract_page_elements_using_image_ensemble(
105
105
  model_name="yolox",
106
106
  max_batch_size=YOLOX_MAX_BATCH_SIZE,
107
107
  trace_info=execution_trace_log,
108
- stage_name="pdf_content_extractor",
108
+ stage_name="pdf_extraction",
109
109
  )
110
110
 
111
111
  # Process results: iterate over each image's inference output.
@@ -99,14 +99,11 @@ def _decode_and_extract_from_pptx(
99
99
 
100
100
  # Retrieve extraction parameters (and remove boolean flags as they are consumed).
101
101
  extract_params: Dict[str, Any] = prepared_task_props.get("params", {})
102
- try:
103
- extract_text: bool = extract_params.pop("extract_text", False)
104
- extract_images: bool = extract_params.pop("extract_images", False)
105
- extract_tables: bool = extract_params.pop("extract_tables", False)
106
- extract_charts: bool = extract_params.pop("extract_charts", False)
107
- extract_infographics: bool = extract_params.pop("extract_infographics", False)
108
- except KeyError as e:
109
- raise ValueError(f"Missing required extraction flag: {e}")
102
+ extract_text: bool = extract_params.pop("extract_text", False)
103
+ extract_images: bool = extract_params.pop("extract_images", False)
104
+ extract_tables: bool = extract_params.pop("extract_tables", False)
105
+ extract_charts: bool = extract_params.pop("extract_charts", False)
106
+ extract_infographics: bool = extract_params.pop("extract_infographics", False)
110
107
 
111
108
  # Inject additional configuration and trace information.
112
109
  if getattr(extraction_config, "pptx_extraction_config", None) is not None:
@@ -129,7 +129,7 @@ class ChartExtractorSchema(BaseModel):
129
129
  @field_validator("max_queue_size", "n_workers")
130
130
  def check_positive(cls, v, field):
131
131
  if v <= 0:
132
- raise ValueError(f"{field.field_name} must be greater than 10.")
132
+ raise ValueError(f"{field.field_name} must be greater than 0.")
133
133
  return v
134
134
 
135
135
  model_config = ConfigDict(extra="forbid")
@@ -122,7 +122,7 @@ class InfographicExtractorSchema(BaseModel):
122
122
  @field_validator("max_queue_size", "n_workers")
123
123
  def check_positive(cls, v, field):
124
124
  if v <= 0:
125
- raise ValueError(f"{field.field_name} must be greater than 10.")
125
+ raise ValueError(f"{field.field_name} must be greater than 0.")
126
126
  return v
127
127
 
128
128
  model_config = ConfigDict(extra="forbid")
@@ -122,7 +122,7 @@ class TableExtractorSchema(BaseModel):
122
122
  @field_validator("max_queue_size", "n_workers")
123
123
  def check_positive(cls, v, field):
124
124
  if v <= 0:
125
- raise ValueError(f"{field.field_name} must be greater than 10.")
125
+ raise ValueError(f"{field.field_name} must be greater than 0.")
126
126
  return v
127
127
 
128
128
  endpoint_config: Optional[TableExtractorConfigSchema] = None
@@ -2,22 +2,36 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ from pydantic import BaseModel, Field
6
+ from typing import Optional, Literal, Annotated
5
7
 
6
- from typing import Optional, Literal
7
8
 
8
- from pydantic import Field, BaseModel
9
- from typing_extensions import Annotated
9
+ class MessageBrokerClientSchema(BaseModel):
10
+ """
11
+ Configuration schema for message broker client connections.
12
+ Supports Redis or simple in-memory clients.
13
+ """
10
14
 
15
+ host: str = Field(default="redis", description="Hostname of the broker service.")
11
16
 
12
- class MessageBrokerClientSchema(BaseModel):
13
- host: str = "redis"
14
- port: Annotated[int, Field(gt=0, lt=65536)] = 6379
17
+ port: Annotated[int, Field(gt=0, lt=65536)] = Field(
18
+ default=6379, description="Port to connect to. Must be between 1 and 65535."
19
+ )
20
+
21
+ client_type: Literal["redis", "simple"] = Field(
22
+ default="redis", description="Type of broker client. Supported values: 'redis', 'simple'."
23
+ )
24
+
25
+ broker_params: Optional[dict] = Field(
26
+ default_factory=dict, description="Optional parameters passed to the broker client."
27
+ )
15
28
 
16
- # Update this for new broker types
17
- client_type: Literal["redis", "simple"] = "redis" # Restrict to 'redis' or 'simple'
29
+ connection_timeout: Annotated[int, Field(ge=0)] = Field(
30
+ default=300, description="Connection timeout in seconds. Must be >= 0."
31
+ )
18
32
 
19
- broker_params: Optional[dict] = Field(default_factory=dict)
33
+ max_backoff: Annotated[int, Field(ge=0)] = Field(
34
+ default=300, description="Maximum backoff time in seconds. Must be >= 0."
35
+ )
20
36
 
21
- connection_timeout: Optional[Annotated[int, Field(ge=0)]] = 300
22
- max_backoff: Optional[Annotated[int, Field(ge=0)]] = 300
23
- max_retries: Optional[Annotated[int, Field(ge=0)]] = 0
37
+ max_retries: Annotated[int, Field(ge=0)] = Field(default=0, description="Maximum number of retries. Must be >= 0.")
@@ -160,29 +160,40 @@ class IngestTaskSchema(BaseModelNoExt):
160
160
  @model_validator(mode="before")
161
161
  @classmethod
162
162
  def check_task_properties_type(cls, values):
163
- task_type, task_properties = values.get("type"), values.get("task_properties", {})
164
- if task_type and task_properties:
165
- expected_type = {
166
- TaskTypeEnum.CAPTION: IngestTaskCaptionSchema,
167
- TaskTypeEnum.DEDUP: IngestTaskDedupSchema,
168
- TaskTypeEnum.EMBED: IngestTaskEmbedSchema,
169
- TaskTypeEnum.EXTRACT: IngestTaskExtractSchema,
170
- TaskTypeEnum.FILTER: IngestTaskFilterSchema, # Extend mapping as necessary
171
- TaskTypeEnum.SPLIT: IngestTaskSplitSchema,
172
- TaskTypeEnum.STORE_EMBEDDING: IngestTaskStoreEmbedSchema,
173
- TaskTypeEnum.STORE: IngestTaskStoreSchema,
174
- TaskTypeEnum.VDB_UPLOAD: IngestTaskVdbUploadSchema,
175
- TaskTypeEnum.AUDIO_DATA_EXTRACT: IngestTaskAudioExtraction,
176
- TaskTypeEnum.TABLE_DATA_EXTRACT: IngestTaskTableExtraction,
177
- TaskTypeEnum.CHART_DATA_EXTRACT: IngestTaskChartExtraction,
178
- TaskTypeEnum.INFOGRAPHIC_DATA_EXTRACT: IngestTaskInfographicExtraction,
179
- }.get(
180
- task_type
181
- ) # Removed .upper()
182
-
183
- # Validate task_properties against the expected schema.
184
- validated_task_properties = expected_type(**task_properties)
185
- values["task_properties"] = validated_task_properties
163
+ task_type = values.get("type")
164
+ task_properties = values.get("task_properties", {})
165
+
166
+ # Ensure task_type is lowercased and converted to enum early
167
+ if isinstance(task_type, str):
168
+ task_type = task_type.lower()
169
+ try:
170
+ task_type = TaskTypeEnum(task_type)
171
+ except ValueError:
172
+ raise ValueError(f"{task_type} is not a valid TaskTypeEnum value")
173
+
174
+ task_type_to_schema = {
175
+ TaskTypeEnum.CAPTION: IngestTaskCaptionSchema,
176
+ TaskTypeEnum.DEDUP: IngestTaskDedupSchema,
177
+ TaskTypeEnum.EMBED: IngestTaskEmbedSchema,
178
+ TaskTypeEnum.EXTRACT: IngestTaskExtractSchema,
179
+ TaskTypeEnum.FILTER: IngestTaskFilterSchema,
180
+ TaskTypeEnum.SPLIT: IngestTaskSplitSchema,
181
+ TaskTypeEnum.STORE_EMBEDDING: IngestTaskStoreEmbedSchema,
182
+ TaskTypeEnum.STORE: IngestTaskStoreSchema,
183
+ TaskTypeEnum.VDB_UPLOAD: IngestTaskVdbUploadSchema,
184
+ TaskTypeEnum.AUDIO_DATA_EXTRACT: IngestTaskAudioExtraction,
185
+ TaskTypeEnum.TABLE_DATA_EXTRACT: IngestTaskTableExtraction,
186
+ TaskTypeEnum.CHART_DATA_EXTRACT: IngestTaskChartExtraction,
187
+ TaskTypeEnum.INFOGRAPHIC_DATA_EXTRACT: IngestTaskInfographicExtraction,
188
+ }
189
+
190
+ expected_schema_cls = task_type_to_schema.get(task_type)
191
+ if expected_schema_cls is None:
192
+ raise ValueError(f"Unsupported or missing task_type '{task_type}'")
193
+
194
+ validated_task_properties = expected_schema_cls(**task_properties)
195
+ values["type"] = task_type # ensure type is now always the enum
196
+ values["task_properties"] = validated_task_properties
186
197
  return values
187
198
 
188
199
  @field_validator("type", mode="before")
@@ -5,7 +5,7 @@
5
5
 
6
6
  import logging
7
7
 
8
- from pydantic import ConfigDict, BaseModel
8
+ from pydantic import ConfigDict, BaseModel, Field
9
9
 
10
10
  from nv_ingest_api.util.logging.configuration import LogLevel
11
11
 
@@ -13,13 +13,14 @@ logger = logging.getLogger(__name__)
13
13
 
14
14
 
15
15
  class TextEmbeddingSchema(BaseModel):
16
- api_key: str = "api_key"
17
- batch_size: int = 4
18
- embedding_model: str = "nvidia/nv-embedqa-e5-v5"
19
- embedding_nim_endpoint: str = "http://embedding:8000/v1"
20
- encoding_format: str = "float"
21
- httpx_log_level: LogLevel = LogLevel.WARNING
22
- input_type: str = "passage"
23
- raise_on_failure: bool = False
24
- truncate: str = "END"
16
+ api_key: str = Field(default="api_key")
17
+ batch_size: int = Field(default=4)
18
+ embedding_model: str = Field(default="nvidia/nv-embedqa-e5-v5")
19
+ embedding_nim_endpoint: str = Field(default="http://embedding:8000/v1")
20
+ encoding_format: str = Field(default="float")
21
+ httpx_log_level: LogLevel = Field(default=LogLevel.WARNING)
22
+ input_type: str = Field(default="passage")
23
+ raise_on_failure: bool = Field(default=False)
24
+ truncate: str = Field(default="END")
25
+
25
26
  model_config = ConfigDict(extra="forbid")
@@ -2,21 +2,23 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
- from pydantic import Field, BaseModel, field_validator
5
+ from pydantic import Field, BaseModel, field_validator, ConfigDict
6
6
 
7
7
  from typing import Optional
8
8
 
9
- from typing_extensions import Annotated
10
-
11
9
 
12
10
  class TextSplitterSchema(BaseModel):
13
11
  tokenizer: Optional[str] = None
14
- chunk_size: Annotated[int, Field(gt=0)] = 1024
15
- chunk_overlap: Annotated[int, Field(ge=0)] = 150
12
+ chunk_size: int = Field(default=1024, gt=0)
13
+ chunk_overlap: int = Field(default=150, ge=0)
16
14
  raise_on_failure: bool = False
17
15
 
18
16
  @field_validator("chunk_overlap")
19
- def check_chunk_overlap(cls, v, values, **kwargs):
20
- if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
17
+ @classmethod
18
+ def check_chunk_overlap(cls, v, values):
19
+ chunk_size = values.data.get("chunk_size")
20
+ if chunk_size is not None and v >= chunk_size:
21
21
  raise ValueError("chunk_overlap must be less than chunk_size")
22
22
  return v
23
+
24
+ model_config = ConfigDict(extra="forbid")
@@ -116,6 +116,7 @@ def _upload_images_to_minio(df: pd.DataFrame, params: Dict[str, Any]) -> pd.Data
116
116
  if "content" not in metadata:
117
117
  logger.error("Row %s: missing 'content' in metadata", idx)
118
118
  continue
119
+
119
120
  if "source_metadata" not in metadata or not isinstance(metadata["source_metadata"], dict):
120
121
  logger.error("Row %s: missing or invalid 'source_metadata' in metadata", idx)
121
122
  continue
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -150,7 +150,7 @@ def extract_tables_and_charts_yolox(
150
150
  min_score=YOLOX_MIN_SCORE,
151
151
  final_thresh=YOLOX_FINAL_SCORE,
152
152
  trace_info=trace_info,
153
- stage_name="pdf_content_extractor",
153
+ stage_name="pdf_extraction",
154
154
  )
155
155
 
156
156
  # Process results: iterate over each image's inference output.
@@ -119,7 +119,7 @@ def pdfium_try_get_bitmap_as_numpy(image_obj) -> np.ndarray:
119
119
  return img_array
120
120
 
121
121
 
122
- @traceable_func(trace_name="pdf_content_extractor::pdfium_pages_to_numpy")
122
+ @traceable_func(trace_name="pdf_extraction::pdfium_pages_to_numpy")
123
123
  def pdfium_pages_to_numpy(
124
124
  pages: List[pdfium.PdfPage],
125
125
  render_dpi: int = 300,
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -45,6 +45,10 @@ class SystemResourceProbe:
45
45
  A value of 0.5 suggests a hyperthread adds 50% extra performance.
46
46
  Requires psutil to be installed and report physical cores.
47
47
  Defaults to 0.75.
48
+
49
+ Note: the default value of 0.75 is a heuristic and may not be optimal
50
+ for all situations. It is where parallel pdf decomposition efficiency
51
+ is observed to begin rolling off.
48
52
  """
49
53
  if not (0.0 <= hyperthread_weight <= 1.0):
50
54
  raise ValueError("hyperthread_weight must be between 0.0 and 1.0")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.5.13.dev20250513
3
+ Version: 2025.5.15.dev20250515
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -18,24 +18,24 @@ nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py,sha25
18
18
  nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py,sha256=1wkciAxu8lz9WuPuoleJFy2s09ieSzXl1S71F9r0BWA,4385
19
19
  nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha256=CM2yV8lfEw1F1ORAjupD4gyIKX0PDDJrL3nsZ5Mnrgg,31539
20
20
  nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
21
- nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=Jy_fNmDbZcdni55Fq7vT6NdbYnCyGoyw0J7QjpK-KPc,13315
21
+ nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=CkaW8ihPmGMQGrZh0ih14gtEpWuGOJ8InPQfZwpsP2g,13300
22
22
  nv_ingest_api/internal/extract/image/image_extractor.py,sha256=4tUWinuFMN3ukWa2tZa2_LtzRiTyUAUCBF6BDkUEvm0,8705
23
- nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=k4Z6JwsoNKsyfmpaQkN_dxJpAv9-RVsRL1BfSWUtXTM,8908
24
- nv_ingest_api/internal/extract/image/table_extractor.py,sha256=80FQef4Dsn6__MNIRCQzFf32s4wUyTOzBFgmA84JZJk,13133
23
+ nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=yc9b2q_Ea08CEVclZ47UkpU4F7qlakPuU3UV9P013W0,8903
24
+ nv_ingest_api/internal/extract/image/table_extractor.py,sha256=ivHaJxYjeHvFM1PZIpxVabPadxtcTsu51j398ZjMhD4,13123
25
25
  nv_ingest_api/internal/extract/image/image_helpers/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
26
- nv_ingest_api/internal/extract/image/image_helpers/common.py,sha256=NU8TEU9p2aIL_KppyhtTgRUPqD4MsanxATG19rKhGjw,15032
26
+ nv_ingest_api/internal/extract/image/image_helpers/common.py,sha256=P8rcl4YPyeWeMJg7u1yejD3k9EnDVEbJgfYEnJ4WO5c,15025
27
27
  nv_ingest_api/internal/extract/pdf/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
28
28
  nv_ingest_api/internal/extract/pdf/pdf_extractor.py,sha256=CxtWaD6mql9MEqSdk2CfSQ9T-Bn87beBkCOuGGjxGt8,2934
29
29
  nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIRiozIRw70Kybw3A72-lcKFeoTI,582
30
30
  nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
31
31
  nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=PpKTqS8jGHBV6mKLGZWwjpfT8ga6Fy8ffrvL-gPAf2c,8182
32
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=6xI8RKceIUqZvByLm433nWzdJUoAOWvqRpTX0Fy-8lg,22933
33
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=jUcquCWbyQPNCHZLaV-XnVqUFsajX4YxVFCiWWwD4QQ,22367
32
+ nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=Uqj1NH7yWga9P6_vCzgny1WKALfF--UdAaGHUF8K_aQ,22926
33
+ nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=fDbrZwJ-lgeHYOq107WXehzdSvyF8zEDza_9UkDm5aE,22360
34
34
  nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
35
35
  nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
36
36
  nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=Jk3wrQ2CZs167juvEZ-uV6qXWQjR08hhIu8otk2MWj4,4931
37
37
  nv_ingest_api/internal/extract/pptx/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
38
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py,sha256=vTGWaR0YXG4gLM0lYXFQ83F2nlU__mmOmXsA0jYlZ70,7871
38
+ nv_ingest_api/internal/extract/pptx/pptx_extractor.py,sha256=o-0P2dDyRFW37uQi_lKk6-eFozTcZvbq-2Y4I0EBMIY,7749
39
39
  nv_ingest_api/internal/extract/pptx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py,sha256=Lg2I1Zq-WJagsZibgyn__8T-M86BjkqAiXWNta9X_EU,29430
41
41
  nv_ingest_api/internal/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -66,20 +66,20 @@ nv_ingest_api/internal/primitives/tracing/tagging.py,sha256=O5dD7Z7j43nrjqn0Axhx
66
66
  nv_ingest_api/internal/schemas/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
67
67
  nv_ingest_api/internal/schemas/extract/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
68
68
  nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=VVppZgV1lnyJCTfADexzoj3V0lOSq3t6Dw_6VhIxZ7k,3771
69
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=mNsv628oslNieU6KPUHw_Iwr4WohtK2dIHoVo2HnaEs,4302
69
+ nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=iu8lHQC0zbBB9VRK7PZisAVzpeSpFqjcXRAnwZ9OzoM,4301
70
70
  nv_ingest_api/internal/schemas/extract/extract_docx_schema.py,sha256=M2N7WjMNvSemHcJHWeNUD_kFG0wC5VE2W3K6SVrJqvA,3761
71
71
  nv_ingest_api/internal/schemas/extract/extract_image_schema.py,sha256=GC4xV8Z9TPLOuxlEtf2fbklSSp8ETGMrDpZgMQ02UwA,3766
72
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py,sha256=_ptTrxN74tpasJ0aQZgaXEUYFe298PJGbGNk6gyeM94,3992
72
+ nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py,sha256=rl_hFDoJaJLTKbtnEpDSBj-73KQL9aUEVKGiW0IdXiU,3991
73
73
  nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py,sha256=G9g1lEORmryUWTzDyZ0vHAuPnVMK7VaRx0E4xzmAw3Q,6589
74
74
  nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py,sha256=5dT0kv-Mmpe5KW-BZc1JOW3rUlgzVZI0rpB79NWytmw,3761
75
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py,sha256=SXBYDU3V97-pPOLfhFmXQveP_awARXP7k1aGcMMEJtU,3951
75
+ nv_ingest_api/internal/schemas/extract/extract_table_schema.py,sha256=sbt3TvQrLsXc8-muKnsyOs4MfpA4VzrprYHdu1IrY8M,3950
76
76
  nv_ingest_api/internal/schemas/message_brokers/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
77
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py,sha256=nbnNzCQCCduoFw4k8XPfkpn3jyyMRpDLROTwEosaSG8,766
77
+ nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py,sha256=4xTSFE_vH7yZE9RRJRflFAG9hNXIaF6K020M_xA7ylw,1351
78
78
  nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDxTamVFqTQs2Yd8uvWyPE5mddHAWSU4PtfEIQ,966
79
79
  nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
80
80
  nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
81
81
  nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
82
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=nn2cExuU9AmrOMGFMN1RmBzzLaPtLMxs0cJai-cu9w8,7753
82
+ nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=xdxwfXFjXbHn3yu_7oxELQO6h5udEXViAWPSst8QpTU,8093
83
83
  nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=_FAE-yeb01hxq05SXrV3NLM4DPUPSfnIbH6ZMliWsEg,6625
84
84
  nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
85
85
  nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395
@@ -89,16 +89,16 @@ nv_ingest_api/internal/schemas/store/store_image_schema.py,sha256=p2LGij9i6sG6RY
89
89
  nv_ingest_api/internal/schemas/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
90
90
  nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py,sha256=xLxXJsm8QeaL7KPe7m5sP2rd_AuNRMX29rdeVdoei3Y,582
91
91
  nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py,sha256=31ThI5fr0yyENeJeE1xMAA-pxk1QVJLwM842zMate_k,429
92
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=vlTjAj1T78QkQXYkC83vZQKTW04x7PeoukEzmkam7sY,732
93
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py,sha256=iM1sUklcZVA6fdeEWRsMqV_ls-E4UcUsGwewv0JJRi4,759
92
+ nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=vB7g5sJ3r3g7xxGWPbSECqhTdPCz9-T-Ng3iGy7N5x8,875
93
+ nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py,sha256=D9K8tvu-tkEBQkZo7uuRzgrHdGyM3ZcNycHbHy5HV2E,791
94
94
  nv_ingest_api/internal/store/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
95
95
  nv_ingest_api/internal/store/embed_text_upload.py,sha256=maxb4FPsBvWgvlrjAPEBlRZEFdJX5NxPG-p8kUbzV7I,9898
96
- nv_ingest_api/internal/store/image_upload.py,sha256=J5EHNng7Z5I6M4f3UcbniKQB29Scr3Qe05wsBpaVXds,9653
96
+ nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9PI25bkBn6Xn9h3I,9654
97
97
  nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
98
98
  nv_ingest_api/internal/transform/caption_image.py,sha256=RYL_b26zfaRlbHz0XvLw9HwaMlXpNhr7gayjxGzdALQ,8545
99
99
  nv_ingest_api/internal/transform/embed_text.py,sha256=F8kg-WXihtuUMwDQUUYjnfGDCdQp1Mkd-jeThOiJT0s,16507
100
100
  nv_ingest_api/internal/transform/split_text.py,sha256=y6NYRkCEVpVsDu-AqrKx2D6JPp1vwxclw9obNZNJIIs,6561
101
- nv_ingest_api/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
101
+ nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
102
102
  nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
103
  nv_ingest_api/util/control_message/validators.py,sha256=KvvbyheJ5rbzvJbH9JKpMR9VfoI0b0uM6eTAZte1p44,1315
104
104
  nv_ingest_api/util/converters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -118,7 +118,7 @@ nv_ingest_api/util/exception_handlers/pdf.py,sha256=FUC41QJKDCfiTv-1c1_8Isxwt1xM
118
118
  nv_ingest_api/util/exception_handlers/schemas.py,sha256=NJngVNf9sk5Uz6CFFfkNO_LBAMt2QZUcMYGxX64oYRk,2179
119
119
  nv_ingest_api/util/image_processing/__init__.py,sha256=Jiy8C1ZuSrNb_eBM1ZTV9IKFIsnjhZi6Ku3JJhVLimA,104
120
120
  nv_ingest_api/util/image_processing/clustering.py,sha256=sUGlZI4cx1q8h4Pns1N9JVpdfSM2BOH8zRmn9QFCtzI,9236
121
- nv_ingest_api/util/image_processing/processing.py,sha256=dHyoxoI2btKT04ODJK0ChB8MR6eCnZ0ZLpbEQowCb5A,6561
121
+ nv_ingest_api/util/image_processing/processing.py,sha256=LSoDDEmahr7a-qSS12McVcowRe3dOrAZwa1h-PD_JPQ,6554
122
122
  nv_ingest_api/util/image_processing/table_and_chart.py,sha256=bxOu9PZYkG_WFCDGw_JLaO60S2pDSN8EOWK3xkIwr2A,14376
123
123
  nv_ingest_api/util/image_processing/transforms.py,sha256=Kz9hrizV314Hy7cRCYK9ZmhmBbVUOZ_z0HEpzZYcslQ,14081
124
124
  nv_ingest_api/util/logging/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -134,21 +134,21 @@ nv_ingest_api/util/multi_processing/__init__.py,sha256=4fojP8Rp_5Hu1YAkqGylqTyEZ
134
134
  nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=dTfP82DgGPaXEJH3jywTO8rNlLZUniD4FFzwv84_giE,7372
135
135
  nv_ingest_api/util/nim/__init__.py,sha256=UqbiXFCqjWcjNvoduXd_0gOUOGBT8JvppiYHOmMyneA,1775
136
136
  nv_ingest_api/util/pdf/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
137
- nv_ingest_api/util/pdf/pdfium.py,sha256=1Py9qj-dWHDisuSc50iHGRSaY4QlyfRKYTwz99nskZI,15768
138
- nv_ingest_api/util/schema/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
+ nv_ingest_api/util/pdf/pdfium.py,sha256=Ch9Gh5jRLcBr3stjCckqWwTUL-T0sI50PlQnZHo_9NA,15761
138
+ nv_ingest_api/util/schema/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
139
139
  nv_ingest_api/util/schema/schema_validator.py,sha256=H0yZ_i_HZaiBRUCGmTBfRB9-hURhVqyd10aS_ynM1_0,321
140
140
  nv_ingest_api/util/service_clients/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
141
141
  nv_ingest_api/util/service_clients/client_base.py,sha256=eCOeq3Rr6Xnnsh-oHszYlQTOffQyzsT8s43V4V8H_h8,2716
142
142
  nv_ingest_api/util/service_clients/kafka/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
143
- nv_ingest_api/util/service_clients/redis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
143
+ nv_ingest_api/util/service_clients/redis/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
144
144
  nv_ingest_api/util/service_clients/redis/redis_client.py,sha256=3NLecvIvVN1v-sA7d7G-_f6qJVZyfJE2H8Iu5KG3Aew,37417
145
145
  nv_ingest_api/util/service_clients/rest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
146
  nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFSNTf7psoOpLREiLN5ezpHFW0HI,21732
147
147
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
148
148
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
- nv_ingest_api/util/system/hardware_info.py,sha256=JGxBbF3kvgYbwhhWvtjNzPxVZQV_npmsordAioBrglo,19252
150
- nv_ingest_api-2025.5.13.dev20250513.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
151
- nv_ingest_api-2025.5.13.dev20250513.dist-info/METADATA,sha256=SnzH1EvOcXSwFdwNfRNQsNfEUbYekGLBHgCugs9UNQ8,13889
152
- nv_ingest_api-2025.5.13.dev20250513.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
153
- nv_ingest_api-2025.5.13.dev20250513.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
154
- nv_ingest_api-2025.5.13.dev20250513.dist-info/RECORD,,
149
+ nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
150
+ nv_ingest_api-2025.5.15.dev20250515.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
151
+ nv_ingest_api-2025.5.15.dev20250515.dist-info/METADATA,sha256=-TkJaHgTLZNZ1V-lETd5cefk6zp73lIALM3kySY8Kmg,13889
152
+ nv_ingest_api-2025.5.15.dev20250515.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
153
+ nv_ingest_api-2025.5.15.dev20250515.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
154
+ nv_ingest_api-2025.5.15.dev20250515.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.4.0)
2
+ Generator: setuptools (80.7.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5