nv-ingest 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.17.dev20250717__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,7 +18,7 @@ router = APIRouter()
18
18
 
19
19
  # List of ALL of the HTTP environment variable endpoints that should be checked
20
20
  READY_CHECK_ENV_VAR_MAP = {
21
- "paddle": "PADDLE_HTTP_ENDPOINT",
21
+ "ocr": "OCR_HTTP_ENDPOINT",
22
22
  "yolox_graphic_elements": "YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT",
23
23
  "yolox_page_elements": "YOLOX_HTTP_ENDPOINT",
24
24
  "yolox_table_structure": "YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT",
@@ -147,8 +147,9 @@ if __name__ == "__main__":
147
147
  os.environ["YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT"] = "127.0.0.1:8004"
148
148
  os.environ["YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT"] = "http://localhost:8003/v1/infer"
149
149
  os.environ["YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL"] = "http"
150
- os.environ["PADDLE_GRPC_ENDPOINT"] = "localhost:8010"
151
- os.environ["PADDLE_INFER_PROTOCOL"] = "grpc"
150
+ os.environ["OCR_GRPC_ENDPOINT"] = "localhost:8010"
151
+ os.environ["OCR_INFER_PROTOCOL"] = "grpc"
152
+ os.environ["OCR_MODEL_NAME"] = "paddle"
152
153
  os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
153
154
  os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
154
155
  os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
@@ -172,7 +173,7 @@ if __name__ == "__main__":
172
173
  nemoretriever_parse_grpc, nemoretriever_parse_http, nemoretriever_parse_auth, nemoretriever_parse_protocol = (
173
174
  get_nim_service("nemoretriever_parse")
174
175
  )
175
- paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
176
+ ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
176
177
 
177
178
  model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")
178
179
  pdf_extractor_config = {
@@ -201,8 +202,8 @@ if __name__ == "__main__":
201
202
  "endpoint_config": {
202
203
  "yolox_endpoints": (yolox_graphic_elements_grpc, yolox_graphic_elements_http),
203
204
  "yolox_infer_protocol": yolox_graphic_elements_protocol,
204
- "paddle_endpoints": (paddle_grpc, paddle_http),
205
- "paddle_infer_protocol": paddle_protocol,
205
+ "ocr_endpoints": (ocr_grpc, ocr_http),
206
+ "ocr_infer_protocol": ocr_protocol,
206
207
  "auth_token": yolox_auth,
207
208
  }
208
209
  }
@@ -210,8 +211,8 @@ if __name__ == "__main__":
210
211
  "endpoint_config": {
211
212
  "yolox_endpoints": (yolox_table_structure_grpc, yolox_table_structure_http),
212
213
  "yolox_infer_protocol": yolox_table_structure_protocol,
213
- "paddle_endpoints": (paddle_grpc, paddle_http),
214
- "paddle_infer_protocol": paddle_protocol,
214
+ "ocr_endpoints": (ocr_grpc, ocr_http),
215
+ "ocr_infer_protocol": ocr_protocol,
215
216
  "auth_token": yolox_auth,
216
217
  }
217
218
  }
@@ -269,8 +269,11 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
269
269
  self._logger.debug("Received message type: %s", type(job))
270
270
  if isinstance(job, BaseModel):
271
271
  self._logger.debug("Message is a BaseModel with response_code: %s", job.response_code)
272
- if job.response_code != 0:
273
- self._logger.debug("Message response_code != 0, returning None")
272
+ if job.response_code not in (0, 2):
273
+ self._logger.debug("Message received with unhandled response_code, returning None")
274
+ return None
275
+ if job.response_code == 2:
276
+ self._logger.debug("Message response_code == 2, returning None")
274
277
  return None
275
278
  job = json.loads(job.response)
276
279
  self._logger.debug("Successfully fetched message with job_id: %s", job.get("job_id", "unknown"))
@@ -78,8 +78,9 @@ class PipelineCreationSchema(BaseModel):
78
78
  otel_exporter_otlp_endpoint: str = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317")
79
79
 
80
80
  # OCR settings
81
- paddle_http_endpoint: str = os.getenv("PADDLE_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/baidu/paddleocr")
82
- paddle_infer_protocol: str = os.getenv("PADDLE_INFER_PROTOCOL", "http")
81
+ ocr_http_endpoint: str = os.getenv("OCR_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/baidu/paddleocr")
82
+ ocr_infer_protocol: str = os.getenv("OCR_INFER_PROTOCOL", "http")
83
+ ocr_model_name: str = os.getenv("OCR_MODEL_NAME", "paddle")
83
84
 
84
85
  # Task queue settings
85
86
  REDIS_INGEST_TASK_QUEUE: str = "ingest_task_queue"
@@ -223,15 +223,15 @@ def add_table_extractor_stage(pipeline, default_cpu_count, stage_name="table_ext
223
223
  yolox_table_structure_grpc, yolox_table_structure_http, yolox_auth, yolox_table_structure_protocol = (
224
224
  get_nim_service("yolox_table_structure")
225
225
  )
226
- paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
226
+ ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
227
227
 
228
228
  table_extractor_config = TableExtractorSchema(
229
229
  **{
230
230
  "endpoint_config": {
231
231
  "yolox_endpoints": (yolox_table_structure_grpc, yolox_table_structure_http),
232
232
  "yolox_infer_protocol": yolox_table_structure_protocol,
233
- "paddle_endpoints": (paddle_grpc, paddle_http),
234
- "paddle_infer_protocol": paddle_protocol,
233
+ "ocr_endpoints": (ocr_grpc, ocr_http),
234
+ "ocr_infer_protocol": ocr_protocol,
235
235
  "auth_token": yolox_auth,
236
236
  }
237
237
  }
@@ -252,15 +252,15 @@ def add_chart_extractor_stage(pipeline, default_cpu_count, stage_name="chart_ext
252
252
  yolox_graphic_elements_grpc, yolox_graphic_elements_http, yolox_auth, yolox_graphic_elements_protocol = (
253
253
  get_nim_service("yolox_graphic_elements")
254
254
  )
255
- paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
255
+ ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
256
256
 
257
257
  chart_extractor_config = ChartExtractorSchema(
258
258
  **{
259
259
  "endpoint_config": {
260
260
  "yolox_endpoints": (yolox_graphic_elements_grpc, yolox_graphic_elements_http),
261
261
  "yolox_infer_protocol": yolox_graphic_elements_protocol,
262
- "paddle_endpoints": (paddle_grpc, paddle_http),
263
- "paddle_infer_protocol": paddle_protocol,
262
+ "ocr_endpoints": (ocr_grpc, ocr_http),
263
+ "ocr_infer_protocol": ocr_protocol,
264
264
  "auth_token": yolox_auth,
265
265
  }
266
266
  }
@@ -278,14 +278,14 @@ def add_chart_extractor_stage(pipeline, default_cpu_count, stage_name="chart_ext
278
278
 
279
279
 
280
280
  def add_infographic_extractor_stage(pipeline, default_cpu_count, stage_name="infographic_extractor"):
281
- paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
281
+ ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
282
282
 
283
283
  infographic_content_extractor_config = InfographicExtractorSchema(
284
284
  **{
285
285
  "endpoint_config": {
286
- "paddle_endpoints": (paddle_grpc, paddle_http),
287
- "paddle_infer_protocol": paddle_protocol,
288
- "auth_token": paddle_auth,
286
+ "ocr_endpoints": (ocr_grpc, ocr_http),
287
+ "ocr_infer_protocol": ocr_protocol,
288
+ "auth_token": ocr_auth,
289
289
  }
290
290
  }
291
291
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.7.16.dev20250716
3
+ Version: 2025.7.17.dev20250717
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -3,7 +3,7 @@ nv_ingest/version.py,sha256=MG7DxlzpnoJI56vqxwzs9WeMAEI3uPhfDiNLs6GN6wI,986
3
3
  nv_ingest/api/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
4
4
  nv_ingest/api/main.py,sha256=XE-p4lJp1E7CCDOB8ENtYFrf63Dtq2bzQiGxpRfL2LA,1603
5
5
  nv_ingest/api/v1/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
6
- nv_ingest/api/v1/health.py,sha256=zqu-isMRjh4NveS4XWh5FaAZGPIlBVxpCOg3Uu8nUHQ,4746
6
+ nv_ingest/api/v1/health.py,sha256=pV-RoVq5y0iBPp0qZoLzd1xKpd0JiHAi0UMyMj99LqU,4740
7
7
  nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19392
8
8
  nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
9
9
  nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -14,7 +14,7 @@ nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py,sha256=PQliU_kyG
14
14
  nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py,sha256=VFii2yxJuikimOxie3edKq5JN06g78AF8bdHSHVX8p8,2677
15
15
  nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py,sha256=N6NH4KgZJ60e_JkGRcSmfQtX37qtX4TMcavOR-n3heE,2549
16
16
  nv_ingest/framework/orchestration/ray/examples/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
17
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py,sha256=qUNvWiNBUEEzuCySY3usWmHQz9qMgTGVZuKmLWqTsi4,16412
17
+ nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py,sha256=DufjmNm-05uTkq_Mz0QQB6fHw_Rl9eX3PRtnH4sntGs,16405
18
18
  nv_ingest/framework/orchestration/ray/examples/task_source_harness.py,sha256=Yt7uxThg7s8WuMiaHLKC8r1XAG7QixegfkT-juE5oNw,1953
19
19
  nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py,sha256=XkvsoIzH5ftXvAZ4ox7mxbx7ESVx6D8Xupcwbqgd52w,3277
20
20
  nv_ingest/framework/orchestration/ray/primitives/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -48,7 +48,7 @@ nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py,sha256=wQSlVx3T14
48
48
  nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py,sha256=0SQHJlFuXlP16YRWduX1fMKgjhUd7UhDAWQ8XZh4_0I,1471
49
49
  nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py,sha256=enylryvcPmzirpOjCahqYJbNSLsNvv1KpMnOzGqNZQQ,11509
50
50
  nv_ingest/framework/orchestration/ray/stages/sources/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
51
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=KzRil999sHGK4jV-EBU8LUuPp_e3W-Vc_feFEAvG2-E,20995
51
+ nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=b9ndnQBB1paR0iRe3NdzQ7BZ2S65LG2jbtjXvvDc_s4,21183
52
52
  nv_ingest/framework/orchestration/ray/stages/storage/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
53
53
  nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py,sha256=6NkwQzseAnaj0Ptpr3oKvab2EnJdMwTjI2p4dS_HzsI,3901
54
54
  nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py,sha256=SMLHQElZkKldnjy0_VHIKS65DBAAtOhwhdoaFe1yb9I,3337
@@ -66,8 +66,8 @@ nv_ingest/framework/orchestration/ray/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EP
66
66
  nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
67
67
  nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py,sha256=flRLS7yc5n6gheykayuL3prC7O-ZhcVY2s9Wc14SGWE,47377
68
68
  nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py,sha256=d2-GS2tqk6JOFdw65CL1AwfjdUbkC_XxUuJH8Dy-aQ0,10456
69
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=5I7N-nmGXaYqyPEtPZbFhgvog2b2c3eagWc69naMc9s,14340
70
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py,sha256=-IoaRYO7tDA9JXJ7J_j-8pVcU4dYWXrKDM3vo392XZA,21229
69
+ nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=dlz83vEFKcvrwsPFP1M0Md1lOYbuOVX2MeyCPq43RGg,14392
70
+ nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py,sha256=rlMqLtuaI-VdVlAT_7-9HSIgX6YfsxWsBzKe3fBvYl0,21136
71
71
  nv_ingest/framework/orchestration/ray/util/pipeline/tools.py,sha256=LQVb8k9jURaxh2Ga44Js_XuYFCbeN4_nLgDmtExovQg,8026
72
72
  nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
73
73
  nv_ingest/framework/orchestration/ray/util/system_tools/memory.py,sha256=ICqY0LLB3hFTZk03iX5yffMSKFH2q_aQomtDVzS_mKw,2228
@@ -96,8 +96,8 @@ nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-
96
96
  nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
97
97
  nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
98
98
  nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
99
- nv_ingest-2025.7.16.dev20250716.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
100
- nv_ingest-2025.7.16.dev20250716.dist-info/METADATA,sha256=PmbdHa6yAUfDi83pFlWjbUkrAr4Lf8hMHbVhNq5DBiY,15142
101
- nv_ingest-2025.7.16.dev20250716.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
102
- nv_ingest-2025.7.16.dev20250716.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
103
- nv_ingest-2025.7.16.dev20250716.dist-info/RECORD,,
99
+ nv_ingest-2025.7.17.dev20250717.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
100
+ nv_ingest-2025.7.17.dev20250717.dist-info/METADATA,sha256=Kn17uqN2NU6FWJzEU46y6-M_zZ8v-l3vFNjnP2BdF-U,15142
101
+ nv_ingest-2025.7.17.dev20250717.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
102
+ nv_ingest-2025.7.17.dev20250717.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
103
+ nv_ingest-2025.7.17.dev20250717.dist-info/RECORD,,