nv-ingest-api 2025.10.4.dev20251004__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (34)
  1. nv_ingest_api/internal/extract/image/chart_extractor.py +7 -3
  2. nv_ingest_api/internal/extract/image/infographic_extractor.py +7 -3
  3. nv_ingest_api/internal/extract/image/table_extractor.py +7 -3
  4. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +6 -4
  5. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +11 -4
  6. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +4 -0
  7. nv_ingest_api/internal/primitives/nim/nim_client.py +158 -15
  8. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +4 -2
  9. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +10 -1
  10. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +4 -2
  11. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +4 -2
  12. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +10 -1
  13. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +6 -4
  14. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +4 -2
  15. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +9 -1
  16. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +56 -1
  17. nv_ingest_api/internal/schemas/meta/metadata_schema.py +9 -0
  18. nv_ingest_api/internal/schemas/mixins.py +39 -0
  19. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
  20. nv_ingest_api/internal/transform/embed_text.py +82 -0
  21. nv_ingest_api/util/dataloader/dataloader.py +20 -9
  22. nv_ingest_api/util/image_processing/transforms.py +67 -1
  23. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  24. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +1 -0
  25. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +8 -2
  26. nv_ingest_api/util/service_clients/redis/redis_client.py +160 -0
  27. nv_ingest_api/util/service_clients/rest/rest_client.py +42 -3
  28. nv_ingest_api/util/string_processing/yaml.py +41 -4
  29. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/METADATA +2 -1
  30. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/RECORD +34 -32
  31. udfs/llm_summarizer_udf.py +132 -137
  32. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  33. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  34. {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -20,10 +20,10 @@ nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha
20
20
  nv_ingest_api/internal/extract/html/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
21
21
  nv_ingest_api/internal/extract/html/html_extractor.py,sha256=I9oWfj6_As4898GDDh0zsSuKxO3lBsvyYzhvUotjzJI,3282
22
22
  nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
23
- nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=Pojiu7R1BT8tUUzD5DsF-dDEwakz1ZfUrL_agalUsNc,13591
23
+ nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=V-unsKvSAJo7QqsDDm_OsZxzGpRwhVGFbc_47zX3hHs,13764
24
24
  nv_ingest_api/internal/extract/image/image_extractor.py,sha256=gBKjlx28hA_e-dupatu46YQgOHJ0DLpAWxREiLaZLyo,9039
25
- nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=gP-WiBIHruDmNFchq4BbVAci3XStMtyeN99M8dLm1j4,10225
26
- nv_ingest_api/internal/extract/image/table_extractor.py,sha256=T80-Smkf54Y5OkSaOquXpcoLbAf5uMnV-LOsBgD0L7E,14440
25
+ nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=UVYkxau53XsTDtYO8Pjt76f4lGLBHltXSCgkp1NBsIs,10398
26
+ nv_ingest_api/internal/extract/image/table_extractor.py,sha256=1z79OM_a9V-91oPXVNw6I388m68HZ4kXwAJw07FLicY,14613
27
27
  nv_ingest_api/internal/extract/image/image_helpers/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
28
28
  nv_ingest_api/internal/extract/image/image_helpers/common.py,sha256=VhqjsBqvUz-2y92t6iryVERTuRfcGUdTHOOScYr8GLo,14916
29
29
  nv_ingest_api/internal/extract/pdf/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -32,7 +32,7 @@ nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIR
32
32
  nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
33
33
  nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=MwzM-n2tu0FHM0wDe_0mONLlzHrPte7EOTuPtzCh7Zs,8384
34
34
  nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=IVbNcH_phMiRSxnkZ04pGfQrPJ-x1zVR3hXyhxv7juc,22977
35
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=CCfxcHAS3mED8zD6GKTGNUi02CzBMs7FsSopevhsiyk,22720
35
+ nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=yAndWwh_k00nP0spYGxlewP3RBPxE4QR-b3U3VgXnBo,22852
36
36
  nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
37
37
  nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
38
38
  nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=uTPTUTWQsGM1oeTUo49_hzwC5Yy9iEokrnS3z3WvtIo,5988
@@ -50,7 +50,7 @@ nv_ingest_api/internal/primitives/control_message_task.py,sha256=nWVB3QsP6p8BKwH
50
50
  nv_ingest_api/internal/primitives/ingest_control_message.py,sha256=8rA0UbPDSB3avReAKNxiUa_FCy7fIQpqk6tfmcYUibA,9879
51
51
  nv_ingest_api/internal/primitives/nim/__init__.py,sha256=-dFBTHQnMKV0yc5tfSqIT-rkJXKtpcmyUfTPs8TJAi8,339
52
52
  nv_ingest_api/internal/primitives/nim/default_values.py,sha256=W92XjfyeC6uuVxut6J7p00x1kpNsnXIDb97gSVytZJk,380
53
- nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=kQAHWwZ6kjTVYZSfa0qRyIOFcqrhMe8LUygGtgzAly0,26321
53
+ nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=kT-JP9jbkXzotS7EeajTgfMbFWhMoD8o2JtOLYu1JuU,32770
54
54
  nv_ingest_api/internal/primitives/nim/nim_model_interface.py,sha256=gWhyR33mIgEOYirq53WOk1bRl1SL0C_SVrM4w1-JmKU,4166
55
55
  nv_ingest_api/internal/primitives/nim/model_interface/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
56
56
  nv_ingest_api/internal/primitives/nim/model_interface/cached.py,sha256=b1HX-PY1ExW5V6pXC1ZiHdobeG_BmbPr3rBbVJef13s,11003
@@ -58,8 +58,8 @@ nv_ingest_api/internal/primitives/nim/model_interface/decorators.py,sha256=qwubk
58
58
  nv_ingest_api/internal/primitives/nim/model_interface/deplot.py,sha256=TvKdk6PTuI1WNhRmNNrvygaI_DIutkJkDL-XdtLZQac,10787
59
59
  nv_ingest_api/internal/primitives/nim/model_interface/helpers.py,sha256=jqbEbavvr9giODpzsGQSRDu5yZ4YfNfKAQfqUm9yUDI,11698
60
60
  nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=WysjDZeegclO3mZgVcGOwzWbr8wSI4pWRiYD4iC2EXo,7098
61
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=QOjKEJaL7Z_aT-luyV4eJSNQX4o-a9-P0CB0ZwSxFk4,29282
62
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=3XXJkeJaVFe_iIfNn_bDYn79JN20besjZHiNZ5dEnZQ,12778
61
+ nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=WuX-veTC510TbvMWYGYP6WCzjYCbUBAUc5ovJUWCrFU,29607
62
+ nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=6M3yxZHEyRX5sCDz-82SIPKw00TabYWxx-ZUEvGKrQQ,12920
63
63
  nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py,sha256=lFhppNqrq5X_fzbCWKphvZQMzaJd3gHrkWsyJORzFrU,5010
64
64
  nv_ingest_api/internal/primitives/nim/model_interface/vlm.py,sha256=sUDKKlVqKjiHEGr2D04I7S4pDfnLR8b-NplV1pf5GVQ,6240
65
65
  nv_ingest_api/internal/primitives/nim/model_interface/yolox.py,sha256=zpfEZIPctWhNfREnP6e77zffU8vs_RfnMprBj-2jXXk,42847
@@ -68,24 +68,25 @@ nv_ingest_api/internal/primitives/tracing/latency.py,sha256=5kVTeYRbRdTlT_aI4MeS
68
68
  nv_ingest_api/internal/primitives/tracing/logging.py,sha256=SSzIgS7afLH-e1C7VagYDmkkA6rTXmQ-bmtLjoEguhg,3851
69
69
  nv_ingest_api/internal/primitives/tracing/tagging.py,sha256=xU534rb94uKnsSu0_DzyZcCSkIpa5SWTMxX7NSA3HoE,11671
70
70
  nv_ingest_api/internal/schemas/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
71
+ nv_ingest_api/internal/schemas/mixins.py,sha256=QyAhZATodR0qWgcq6HxO3yTd31X1nxZfxvpNrXfaY9w,1236
71
72
  nv_ingest_api/internal/schemas/extract/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
72
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=3cjLcw5zFUb7WNbGPLDJN2KukF_hoOM7PE33UHici6w,3873
73
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=PZFJPLrLs8k5I5ufnp0XWrBjmbQVkkaxjb-xq-2rn2Q,4317
74
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py,sha256=Bafw6lIXLS2PcEpU82D4Vb0OPD_FvGSr546IedsfR8o,3795
73
+ nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=fl08h6XBjKRtm1LJlTJpH3s_LGoaKmuq8duBmHrXzXc,3974
74
+ nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=24zXz08DD-OXiOhjZPfv9fxIki8-c50xt3kH5JemE8I,4787
75
+ nv_ingest_api/internal/schemas/extract/extract_docx_schema.py,sha256=qSHCIFWx6w5NTguWrgTXRD3i-H2nRMxjoNTw2IsF5Zg,3896
75
76
  nv_ingest_api/internal/schemas/extract/extract_html_schema.py,sha256=lazpONTGZ6Fl420BGBAr6rogFGtlzBiZTc1uA694OIs,841
76
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py,sha256=hXiHIKIZS2qb4u8g10m_S5CpeAzHvH3tSB_He5LYOfU,3800
77
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py,sha256=Yb580-k1oYXanWMA17u0LGO5-AzUP80j6aTPGpjn920,4007
78
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py,sha256=TTFzlPG5r1QFO2B5VWBPF69IXP1eQKrg16wo49hHDV0,6650
79
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py,sha256=6KomEL6wdhpZVnE2SiOeZJNhyfHIQ2sARdfm0R16uCM,3795
80
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py,sha256=npBEGXkVaShOiHrc8FQ-25kHhPRD8WB6dRn-T5TScKc,3969
77
+ nv_ingest_api/internal/schemas/extract/extract_image_schema.py,sha256=NJAg1m-CPHAs4BMW8P09qL9sxS3ARBZOMpeNewFc5I8,3901
78
+ nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py,sha256=e65tQs6zSo7P7Wl9KZJznomvBrS1PmLX9f0IjytszO4,4477
79
+ nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py,sha256=J7J-rO6RO31777m9RKSdcOZhIwgIS7P9Y4M6WBaUYWs,6778
80
+ nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py,sha256=vuHK2qf-I41iDgnuXc08dol45LR5qcb0lYi1BQbZa74,3896
81
+ nv_ingest_api/internal/schemas/extract/extract_table_schema.py,sha256=ZMH0aR4fl4BMP5ZJq_J6R9Bq5VuHPxaS76bm9cMZn84,4438
81
82
  nv_ingest_api/internal/schemas/message_brokers/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
82
83
  nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py,sha256=4xTSFE_vH7yZE9RRJRflFAG9hNXIaF6K020M_xA7ylw,1351
83
84
  nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDxTamVFqTQs2Yd8uvWyPE5mddHAWSU4PtfEIQ,966
84
85
  nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
85
86
  nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
86
87
  nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
87
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=cIpoesvIs0dR6s8dGjGHL246k5kf7hDmdhA48i8Si7s,10253
88
- nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=FDD6yq-QxW8yDwn0Bq6bmWakX41ABMn3cytrvCbT-Po,11961
88
+ nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=auvKHFJm9FquYRS6Ro7GawvgNhszT-1uG3ADMy4E_B8,12240
89
+ nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=nHS2PwYE7YwuTUotvUd0hP8a-5f9uefy6_G3mMH4UyQ,12321
89
90
  nv_ingest_api/internal/schemas/meta/udf.py,sha256=GgzqbZOlipQgMpDhbXLqbF8xrHenj_hMNqhR_P-1ynw,779
90
91
  nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
91
92
  nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395
@@ -95,14 +96,14 @@ nv_ingest_api/internal/schemas/store/store_image_schema.py,sha256=p2LGij9i6sG6RY
95
96
  nv_ingest_api/internal/schemas/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
96
97
  nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py,sha256=fRMRwcWP-L8sfv2enNDt_W_CL0eC2i3b_1VCCtmr1K8,1188
97
98
  nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py,sha256=31ThI5fr0yyENeJeE1xMAA-pxk1QVJLwM842zMate_k,429
98
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=rzdhRANCqG9mOEoLargznuBwj1-MbEQUu2LDVi5vl50,1616
99
+ nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=gWED1Q861onhSmDIESX1ZG1BCyelhzNTepyb5ZZuFXc,1738
99
100
  nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py,sha256=D9K8tvu-tkEBQkZo7uuRzgrHdGyM3ZcNycHbHy5HV2E,791
100
101
  nv_ingest_api/internal/store/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
101
102
  nv_ingest_api/internal/store/embed_text_upload.py,sha256=maxb4FPsBvWgvlrjAPEBlRZEFdJX5NxPG-p8kUbzV7I,9898
102
103
  nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9PI25bkBn6Xn9h3I,9654
103
104
  nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
104
105
  nv_ingest_api/internal/transform/caption_image.py,sha256=0ILCG2F8ESqKtZiPUM-6F1BHUflFZ76Dzi2GNzkE-lU,8517
105
- nv_ingest_api/internal/transform/embed_text.py,sha256=LB_2Zvw6plc7uOWT2QN13aDu2qFumXzl_RB3ZcZbLGs,20191
106
+ nv_ingest_api/internal/transform/embed_text.py,sha256=F3G1zVFDJMYYZkly7bb6w5bgbUed9sv5sDd02JOF3no,23163
106
107
  nv_ingest_api/internal/transform/split_text.py,sha256=LAtInGVuydH43UwjNMQWFVC1A6NdhXP_dZup2xX4qEo,7745
107
108
  nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
108
109
  nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -115,7 +116,7 @@ nv_ingest_api/util/converters/dftools.py,sha256=FjHjazIeiUd1LdFwWuummJmraqZe1a90
115
116
  nv_ingest_api/util/converters/formats.py,sha256=L11FtormO2SeHSebbwsGE_uuCv6Jk0D3VvVW2avU0vI,2258
116
117
  nv_ingest_api/util/converters/type_mappings.py,sha256=5TVXRyU6BlQvFOdqknEuQw3ss4PXeCvSUynJnjvgQpA,1102
117
118
  nv_ingest_api/util/dataloader/__init__.py,sha256=B6ybDORMI9IzXGdhM7w_agcVj1BNYgAlcfTA0lG5jng,308
118
- nv_ingest_api/util/dataloader/dataloader.py,sha256=r_TU-RfdYerl3k3jRsGIVByxejwz-UQDuallx5-YAGM,14790
119
+ nv_ingest_api/util/dataloader/dataloader.py,sha256=1SG0cHKo7X_eBRTVMJ9EFJOfpqe37QqfEoMxeAWxkEU,15124
119
120
  nv_ingest_api/util/detectors/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
120
121
  nv_ingest_api/util/detectors/language.py,sha256=TvzcESYY0bn0U4aLY6GjB4VaCWA6XrXxAGZbVzHTMuE,965
121
122
  nv_ingest_api/util/exception_handlers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -128,7 +129,7 @@ nv_ingest_api/util/image_processing/__init__.py,sha256=Jiy8C1ZuSrNb_eBM1ZTV9IKFI
128
129
  nv_ingest_api/util/image_processing/clustering.py,sha256=sUGlZI4cx1q8h4Pns1N9JVpdfSM2BOH8zRmn9QFCtzI,9236
129
130
  nv_ingest_api/util/image_processing/processing.py,sha256=LSoDDEmahr7a-qSS12McVcowRe3dOrAZwa1h-PD_JPQ,6554
130
131
  nv_ingest_api/util/image_processing/table_and_chart.py,sha256=idCIjiLkY-usI2EARchg3omWLtIYmYA-1tdUUV2lbno,16338
131
- nv_ingest_api/util/image_processing/transforms.py,sha256=ygIBf-EWm4mqi1qcnLb-l6TZps8gjotZakxj8ktxdYU,27730
132
+ nv_ingest_api/util/image_processing/transforms.py,sha256=Mj0ry3DzCKY83ZNfvNuAIQBSkRvsxHPe0VAHRJb2BfA,30136
132
133
  nv_ingest_api/util/imports/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
133
134
  nv_ingest_api/util/imports/callable_signatures.py,sha256=ipzXNZJpfu7oeTBrQz2h6zrFVIQaqb2KBpzSuIX3u-Y,4138
134
135
  nv_ingest_api/util/imports/dynamic_resolvers.py,sha256=qy7RpmBZrXJarOQl3J7jiCKnbZMNChXTL_Z-H4c9zlc,6170
@@ -139,14 +140,15 @@ nv_ingest_api/util/logging/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
139
140
  nv_ingest_api/util/logging/configuration.py,sha256=05KR3LOS-PCqU-Io__iiKG_Ds730eKxciklFfNeId3w,3126
140
141
  nv_ingest_api/util/logging/sanitize.py,sha256=-dIbmvLTevrTRd18QKUQQMV4hBk6pStWP_7_VtDDctg,2584
141
142
  nv_ingest_api/util/message_brokers/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
143
+ nv_ingest_api/util/message_brokers/qos_scheduler.py,sha256=TdpjRyUfqR9y1v9SNxZaIN9ZgxVMlEGvLFhfUD7jjO8,10339
142
144
  nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py,sha256=WaQ3CWIpIKWEivT5kL-bkmzcSQKLGFNFHdXHUJjqZFs,325
143
145
  nv_ingest_api/util/message_brokers/simple_message_broker/broker.py,sha256=PekxaxVcAa9k1wgUtozlr04SW3sAeqYJE-wdVBZf9eo,17264
144
146
  nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py,sha256=3p-LRqG8qLnsfEhBNf73_DG22C08JKahTqUvPLS2Apg,2554
145
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py,sha256=CCRAbq2EBH2quX9UTfuBbz3tTMDnWqhEF33roFwbyuk,16484
147
+ nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py,sha256=BFuegFsbU_YB_98gzhs8oU2by4_iVzIDanT0nJdjJ7g,16517
146
148
  nv_ingest_api/util/metadata/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
147
149
  nv_ingest_api/util/metadata/aggregators.py,sha256=YYdvJ1E04eGFZKKHUxXoH6mzLg8nor9Smvnv0qzqK5w,15988
148
150
  nv_ingest_api/util/multi_processing/__init__.py,sha256=4fojP8Rp_5Hu1YAkqGylqTyEZ-HBVVEunn5Z9I99swA,242
149
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=dTfP82DgGPaXEJH3jywTO8rNlLZUniD4FFzwv84_giE,7372
151
+ nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=34O7I8Lin5GvO_zNZGbsqEGkDvIbqy_0Eh3ejoPNDVE,7501
150
152
  nv_ingest_api/util/nim/__init__.py,sha256=No45pMstom1Jo0EENT6VEFkZn3YmTha7lYaBZU7xtHk,2116
151
153
  nv_ingest_api/util/pdf/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
152
154
  nv_ingest_api/util/pdf/pdfium.py,sha256=1aPCnPKXHWnncYoMO8HllYjrhODSXIeRBIsSLDevpYs,15667
@@ -156,18 +158,18 @@ nv_ingest_api/util/service_clients/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusX
156
158
  nv_ingest_api/util/service_clients/client_base.py,sha256=eCOeq3Rr6Xnnsh-oHszYlQTOffQyzsT8s43V4V8H_h8,2716
157
159
  nv_ingest_api/util/service_clients/kafka/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
158
160
  nv_ingest_api/util/service_clients/redis/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
159
- nv_ingest_api/util/service_clients/redis/redis_client.py,sha256=3NLecvIvVN1v-sA7d7G-_f6qJVZyfJE2H8Iu5KG3Aew,37417
161
+ nv_ingest_api/util/service_clients/redis/redis_client.py,sha256=b7rqJKYW27lmuSjTTho1sO2-q093cfeXARx8JgCHZ-o,44042
160
162
  nv_ingest_api/util/service_clients/rest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
161
- nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFSNTf7psoOpLREiLN5ezpHFW0HI,21732
163
+ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=7ymPxhuN9SP8nPSVepqqbvUxXPaTVunq2aC2bDbg98g,23684
162
164
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
163
165
  nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jfXRo9_M6hCZ59OxKLxG_47HRY,29888
164
- nv_ingest_api/util/string_processing/yaml.py,sha256=6SW2O6wbXRhGbhETMbtXjYCZn53HeCNOP6a96AaxlHs,1454
166
+ nv_ingest_api/util/string_processing/yaml.py,sha256=4Zdmc4474lUZn6kznqaNTlQJwsmRnnJQZ-DvAWLu-zo,2678
165
167
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
166
168
  nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
167
- nv_ingest_api-2025.10.4.dev20251004.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
169
+ nv_ingest_api-2025.11.2.dev20251102.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
168
170
  udfs/__init__.py,sha256=pXFqPgXIUqHDfj7SAR1Q19tt8KwGv_iMvhHyziz4AYM,205
169
- udfs/llm_summarizer_udf.py,sha256=sIMfcH4GRyciTKUtq4dmhd6fZmAp07X32irIC4k7nEI,7316
170
- nv_ingest_api-2025.10.4.dev20251004.dist-info/METADATA,sha256=we6OaOQesMnj6J87Fg0W5ZdYcsJ6zOE-30dbrVnoLrI,14085
171
- nv_ingest_api-2025.10.4.dev20251004.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
172
- nv_ingest_api-2025.10.4.dev20251004.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
173
- nv_ingest_api-2025.10.4.dev20251004.dist-info/RECORD,,
171
+ udfs/llm_summarizer_udf.py,sha256=lH5c5NHoT-5ecHC3og_40u1Ujta8SpsKU4X0e4wzbMU,7314
172
+ nv_ingest_api-2025.11.2.dev20251102.dist-info/METADATA,sha256=tE2TOo_c9GVqOG_deBDe46ps5gTi2Mr5_l1FupFhG2I,14105
173
+ nv_ingest_api-2025.11.2.dev20251102.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
174
+ nv_ingest_api-2025.11.2.dev20251102.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
175
+ nv_ingest_api-2025.11.2.dev20251102.dist-info/RECORD,,
@@ -2,22 +2,39 @@
2
2
  """
3
3
  LLM Content Summarizer UDF for NV-Ingest Pipeline
4
4
 
5
- This UDF uses an LLM API to generate concise summaries
6
- of text content chunks, adding AI-generated summaries to the metadata for
7
- enhanced downstream processing and search capabilities.
5
+ Generates document summaries using NVIDIA-hosted LLMs. This production UDF demonstrates how to extract the pipeline
6
+ payload, run custom code (summarization), and inject results into the metadata for downstream usecases (such as
7
+ retrieval).
8
8
 
9
- Environment Variables:
9
+ These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
10
10
  - NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
11
11
  - LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
12
- - LLM_SUMMARIZATION_BASE_URL: API base URL (default: https://integrate.api.nvidia.com/v1)
13
- - LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
14
- - LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
15
- - LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
12
+ - LLM_BASE_URL: base URL (default: https://integrate.api.nvidia.com/v1)
13
+ - TIMEOUT: API timeout in seconds (default: 60)
14
+ - MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
15
+ - MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
16
+ TODO: Implement this
17
+ - NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
18
+
19
+ More info can be found in `examples/udfs/README.md`
16
20
  """
17
21
 
18
- import os
19
22
  import logging
20
- from typing import Optional
23
+ import os
24
+ import time
25
+
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ PROMPT = """
30
+ Here are the contents from the first and last page of a document. Focus on the main purpose, key topics,
31
+ and important details. Just return the summary as a paragraph. Do not add special characters for formatting.
32
+ This summary will be used for document search and understanding.
33
+
34
+ [CONTENT]
35
+ {content}
36
+ [END CONTENT]
37
+ """
21
38
 
22
39
 
23
40
  def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage": # noqa: F821
@@ -27,13 +44,6 @@ def content_summarizer(control_message: "IngestControlMessage") -> "IngestContro
27
44
  This function processes text primitives and generates concise summaries using
28
45
  an LLM API, storing the results in the metadata's custom_content field.
29
46
 
30
- Features:
31
- - Flexible content detection across multiple metadata locations
32
- - Robust error handling with graceful fallbacks
33
- - Comprehensive logging for monitoring and debugging
34
- - Configurable content length thresholds
35
- - Safe metadata manipulation preserving existing data
36
-
37
47
  Parameters
38
48
  ----------
39
49
  control_message : IngestControlMessage
@@ -44,167 +54,152 @@ def content_summarizer(control_message: "IngestControlMessage") -> "IngestContro
44
54
  IngestControlMessage
45
55
  The modified control message with LLM summaries added to metadata
46
56
  """
47
- from openai import OpenAI
48
-
49
- logger = logging.getLogger(__name__)
50
57
  logger.info("UDF: Starting LLM content summarization")
51
58
 
52
- # Get configuration from environment
53
- api_key = os.getenv("NVIDIA_API_KEY", "")
59
+ api_key = os.getenv("NVIDIA_API_KEY")
54
60
  model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
55
61
  base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
56
- timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", "60"))
57
- min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", "50"))
58
- max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", "12000"))
62
+ min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
63
+ max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
64
+ timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))
65
+
66
+ stats = {
67
+ "skipped": False,
68
+ "failed": False,
69
+ "tokens": 0,
70
+ "duration": 0.0,
71
+ }
59
72
 
60
73
  if not api_key:
61
- logger.warning("NVIDIA_API_KEY not found, skipping summarization")
74
+ logger.error("NVIDIA_API_KEY not set. Skipping...")
62
75
  return control_message
63
76
 
64
- # Get the DataFrame payload
65
77
  df = control_message.payload()
66
- if df is None or len(df) == 0:
67
- logger.warning("No payload found in control message")
68
- return control_message
69
-
70
- logger.info(f"Processing {len(df)} rows for LLM summarization")
71
78
 
72
- # Initialize OpenAI client with error handling
73
- try:
74
- client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
75
- except Exception as e:
76
- logger.error(f"Failed to initialize OpenAI client: {e}")
79
+ if df is None or df.empty:
80
+ logger.warning("No payload found. Nothing to summarize.")
77
81
  return control_message
78
82
 
79
- # Stats for reporting
80
- stats = {"processed": 0, "summarized": 0, "skipped": 0, "failed": 0}
81
-
82
- # Process each row
83
- for idx, row in df.iterrows():
84
- stats["processed"] += 1
85
-
86
- try:
87
- # Extract content - be more flexible about where it comes from
88
- content = _extract_content(row, logger)
89
-
90
- if not content:
91
- stats["skipped"] += 1
92
- continue
93
-
94
- content = content.strip()
95
- if len(content) < min_content_length:
96
- stats["skipped"] += 1
97
- continue
98
-
99
- # Truncate if needed
100
- if len(content) > max_content_length:
101
- content = content[:max_content_length]
102
-
103
- # Generate summary
104
- summary = _generate_summary(client, content, model_name, logger)
83
+ # Select first and last chunk for summarization
84
+ # According to docs/docs/extraction/user_defined_functions.md#understanding-the-dataframe-payload
85
+ # the rows are not necessarily pages. they are chunks of data extracted from the document. in order to select
86
+ # pages, it must require parsing the payload to see which chunks correspond to which pages
87
+ original_df = df.copy()
88
+ if len(df) > 1:
89
+ # TODO: add feature to select N first and last chunks
90
+ df = df.iloc[[0, -1]]
91
+ else:
92
+ logger.info("Document has only one chunk")
93
+
94
+ # Combine all content into a single string
95
+ content_list = df.apply(
96
+ _extract_content,
97
+ axis=1,
98
+ min_content_length=min_content_length,
99
+ max_content_length=max_content_length,
100
+ stats=stats,
101
+ )
102
+ content = " ".join(content_list)
105
103
 
106
- if summary:
107
- # Add to metadata
108
- _add_summary(df, idx, row, summary, model_name, logger)
109
- stats["summarized"] += 1
110
- else:
111
- stats["failed"] += 1
104
+ # Nicely ask LLM to summarize content
105
+ summary, stats["duration"] = _generate_llm_summary(content, model_name, base_url, api_key, timeout)
112
106
 
113
- except Exception as e:
114
- stats["failed"] += 1
115
- logger.error(f"Row {idx}: Error processing content: {e}")
107
+ stats["failed"] = summary is None
108
+ if not stats["failed"]:
109
+ stats["tokens"] = _estimate_tokens(content)
110
+ logger.info("Summarized %d tokens in %f seconds using %s", stats["tokens"], stats["duration"], model_name)
111
+ _store_summary(original_df, summary, model_name)
116
112
 
117
- # Update the control message with modified DataFrame
118
- control_message.payload(df)
113
+ # Update the control message with modified DataFrame
114
+ control_message.payload(original_df)
119
115
 
120
- logger.info(
121
- f"LLM summarization complete: {stats['summarized']}/{stats['processed']} documents summarized, "
122
- f"{stats['skipped']} skipped, {stats['failed']} failed"
123
- )
116
+ else:
117
+ logger.warning("%s failed to summarize content", model_name)
124
118
 
125
119
  return control_message
126
120
 
127
121
 
128
- def _extract_content(row, logger) -> Optional[str]:
129
- """Extract text content from row, trying multiple locations."""
130
- content = ""
131
-
132
- # Try different locations for content
133
- if isinstance(row.get("metadata"), dict):
134
- metadata = row["metadata"]
135
-
136
- # Primary location: metadata.content
137
- content = metadata.get("content", "")
122
+ def _extract_content(row, stats: dict, min_content_length: int = 50, max_content_length: int = 12000) -> str | None:
123
+ """Extract text content from row"""
124
+ metadata = row.get("metadata")
138
125
 
139
- # If no content, try other locations
140
- if not content:
141
- # Try in text_metadata
142
- text_metadata = metadata.get("text_metadata", {})
143
- content = text_metadata.get("text", "") or text_metadata.get("content", "")
144
-
145
- # Try top-level content field
146
- if not content:
147
- content = row.get("content", "")
126
+ if isinstance(metadata, dict):
127
+ content = metadata.get("content")
128
+ if content is not None:
129
+ content = content.strip()
130
+ if len(content) < min_content_length:
131
+ stats["skipped"] = True
132
+ logger.warning(f"Content less than min={min_content_length}. Skipping...")
133
+ content = ""
134
+ elif len(content) > max_content_length:
135
+ logger.warning(f"Truncating content to {max_content_length} characters")
136
+ content = content[:max_content_length]
137
+ else:
138
+ stats["skipped"] = True
139
+ content = ""
148
140
 
149
- if not content:
150
- return None
141
+ else:
142
+ stats["skipped"] = True
143
+ logger.warning("No metadata found. Skipping...")
144
+ content = ""
151
145
 
152
146
  return content
153
147
 
154
148
 
155
- def _generate_summary(client, content: str, model_name: str, logger) -> Optional[str]:
156
- """Generate summary with robust error handling."""
157
- prompt = f"""Please provide a comprehensive 3-4 sentence summary of the following document:
158
-
159
- {content}
160
-
161
- Focus on the main purpose, key topics, and important details.
162
- This summary will be used for document search and understanding.
163
-
164
- Summary:"""
149
+ def _generate_llm_summary(
150
+ content: str,
151
+ model_name: str,
152
+ base_url: str,
153
+ api_key: str,
154
+ timeout: int,
155
+ ) -> tuple[str | None, float]:
156
+ """Ask an LLM to summarize content extracted from doc."""
165
157
 
158
+ start_time = time.time()
166
159
  try:
160
+ from openai import OpenAI
161
+
162
+ client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
163
+ start_time = time.time()
167
164
  completion = client.chat.completions.create(
168
165
  model=model_name,
169
- messages=[{"role": "user", "content": prompt}],
166
+ messages=[{"role": "user", "content": PROMPT.format(content=content)}],
170
167
  max_tokens=400, # Increased for more comprehensive summaries
171
168
  temperature=0.7,
172
169
  )
170
+ duration = time.time() - start_time
173
171
 
174
- if completion.choices and len(completion.choices) > 0:
172
+ if completion.choices:
175
173
  summary = completion.choices[0].message.content.strip()
176
- return summary
177
- else:
178
- return None
174
+ return summary, duration
175
+ return None, duration
179
176
 
180
177
  except Exception as e:
181
178
  logger.error(f"API call failed: {e}")
182
- return None
179
+ # TODO: GitHub Thread
180
+ # Reviewers, tell me if this is a bad idea.
181
+ # I think the convention is to return timestamp for time even if it fails
182
+ return None, time.time() - start_time
183
183
 
184
184
 
185
- def _add_summary(df, idx: int, row, summary: str, model_name: str, logger):
186
- """Add summary to metadata with safe handling."""
187
- try:
188
- # Get current metadata or create new dict - handle None case properly
189
- existing_metadata = row.get("metadata")
190
- if existing_metadata is not None and isinstance(existing_metadata, dict):
191
- metadata = dict(existing_metadata) # Create a copy
192
- else:
193
- metadata = {}
185
+ def _store_summary(df, summary: str, model_name: str):
186
+ """Add summary to metadata and store in df"""
187
+ # hardcoded heuristic to store everything on chunk 0's metadata
188
+ row_0 = df.iloc[0]
194
189
 
195
- # Ensure custom_content exists
196
- if "custom_content" not in metadata or metadata["custom_content"] is None:
197
- metadata["custom_content"] = {}
190
+ # this is a reference to a dictionary that is stored in the dataframe
191
+ # and is modified in place
192
+ metadata = row_0.get("metadata")
198
193
 
199
- # Add LLM summary
200
- metadata["custom_content"]["llm_summary"] = {"summary": summary, "model": model_name}
194
+ if metadata.get("custom_content") is None:
195
+ metadata["custom_content"] = {}
196
+ metadata["custom_content"]["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
201
197
 
202
- # Update the DataFrame at the specific index
203
- try:
204
- df.at[idx, "metadata"] = metadata
205
- except Exception:
206
- # Alternative approach: update the original row reference
207
- df.iloc[idx]["metadata"] = metadata
208
198
 
209
- except Exception as e:
210
- logger.error(f"Failed to add summary to row {idx}: {e}")
199
+ def _estimate_tokens(text: str) -> int:
200
+ """Rough estimate (~4 characters per token)"""
201
+ return len(text) // 4
202
+
203
+
204
+ def _safe_model_name(name: str) -> str:
205
+ return name.replace("/", "__").replace("-", "_")