nv-ingest-api 2025.10.4.dev20251004__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/internal/extract/image/chart_extractor.py +7 -3
- nv_ingest_api/internal/extract/image/infographic_extractor.py +7 -3
- nv_ingest_api/internal/extract/image/table_extractor.py +7 -3
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +6 -4
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +11 -4
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +4 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +158 -15
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +4 -2
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +10 -1
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +4 -2
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +4 -2
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +10 -1
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +6 -4
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +4 -2
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +9 -1
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +56 -1
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +9 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
- nv_ingest_api/internal/transform/embed_text.py +82 -0
- nv_ingest_api/util/dataloader/dataloader.py +20 -9
- nv_ingest_api/util/image_processing/transforms.py +67 -1
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +1 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +8 -2
- nv_ingest_api/util/service_clients/redis/redis_client.py +160 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +42 -3
- nv_ingest_api/util/string_processing/yaml.py +41 -4
- {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/METADATA +2 -1
- {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/RECORD +34 -32
- udfs/llm_summarizer_udf.py +132 -137
- {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.10.4.dev20251004.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
|
@@ -20,10 +20,10 @@ nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py,sha
|
|
|
20
20
|
nv_ingest_api/internal/extract/html/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
21
21
|
nv_ingest_api/internal/extract/html/html_extractor.py,sha256=I9oWfj6_As4898GDDh0zsSuKxO3lBsvyYzhvUotjzJI,3282
|
|
22
22
|
nv_ingest_api/internal/extract/image/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
23
|
-
nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=
|
|
23
|
+
nv_ingest_api/internal/extract/image/chart_extractor.py,sha256=V-unsKvSAJo7QqsDDm_OsZxzGpRwhVGFbc_47zX3hHs,13764
|
|
24
24
|
nv_ingest_api/internal/extract/image/image_extractor.py,sha256=gBKjlx28hA_e-dupatu46YQgOHJ0DLpAWxREiLaZLyo,9039
|
|
25
|
-
nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=
|
|
26
|
-
nv_ingest_api/internal/extract/image/table_extractor.py,sha256=
|
|
25
|
+
nv_ingest_api/internal/extract/image/infographic_extractor.py,sha256=UVYkxau53XsTDtYO8Pjt76f4lGLBHltXSCgkp1NBsIs,10398
|
|
26
|
+
nv_ingest_api/internal/extract/image/table_extractor.py,sha256=1z79OM_a9V-91oPXVNw6I388m68HZ4kXwAJw07FLicY,14613
|
|
27
27
|
nv_ingest_api/internal/extract/image/image_helpers/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
28
28
|
nv_ingest_api/internal/extract/image/image_helpers/common.py,sha256=VhqjsBqvUz-2y92t6iryVERTuRfcGUdTHOOScYr8GLo,14916
|
|
29
29
|
nv_ingest_api/internal/extract/pdf/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
@@ -32,7 +32,7 @@ nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIR
|
|
|
32
32
|
nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
|
|
33
33
|
nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=MwzM-n2tu0FHM0wDe_0mONLlzHrPte7EOTuPtzCh7Zs,8384
|
|
34
34
|
nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=IVbNcH_phMiRSxnkZ04pGfQrPJ-x1zVR3hXyhxv7juc,22977
|
|
35
|
-
nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=
|
|
35
|
+
nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=yAndWwh_k00nP0spYGxlewP3RBPxE4QR-b3U3VgXnBo,22852
|
|
36
36
|
nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
|
|
37
37
|
nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
|
|
38
38
|
nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=uTPTUTWQsGM1oeTUo49_hzwC5Yy9iEokrnS3z3WvtIo,5988
|
|
@@ -50,7 +50,7 @@ nv_ingest_api/internal/primitives/control_message_task.py,sha256=nWVB3QsP6p8BKwH
|
|
|
50
50
|
nv_ingest_api/internal/primitives/ingest_control_message.py,sha256=8rA0UbPDSB3avReAKNxiUa_FCy7fIQpqk6tfmcYUibA,9879
|
|
51
51
|
nv_ingest_api/internal/primitives/nim/__init__.py,sha256=-dFBTHQnMKV0yc5tfSqIT-rkJXKtpcmyUfTPs8TJAi8,339
|
|
52
52
|
nv_ingest_api/internal/primitives/nim/default_values.py,sha256=W92XjfyeC6uuVxut6J7p00x1kpNsnXIDb97gSVytZJk,380
|
|
53
|
-
nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=
|
|
53
|
+
nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=kT-JP9jbkXzotS7EeajTgfMbFWhMoD8o2JtOLYu1JuU,32770
|
|
54
54
|
nv_ingest_api/internal/primitives/nim/nim_model_interface.py,sha256=gWhyR33mIgEOYirq53WOk1bRl1SL0C_SVrM4w1-JmKU,4166
|
|
55
55
|
nv_ingest_api/internal/primitives/nim/model_interface/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
56
56
|
nv_ingest_api/internal/primitives/nim/model_interface/cached.py,sha256=b1HX-PY1ExW5V6pXC1ZiHdobeG_BmbPr3rBbVJef13s,11003
|
|
@@ -58,8 +58,8 @@ nv_ingest_api/internal/primitives/nim/model_interface/decorators.py,sha256=qwubk
|
|
|
58
58
|
nv_ingest_api/internal/primitives/nim/model_interface/deplot.py,sha256=TvKdk6PTuI1WNhRmNNrvygaI_DIutkJkDL-XdtLZQac,10787
|
|
59
59
|
nv_ingest_api/internal/primitives/nim/model_interface/helpers.py,sha256=jqbEbavvr9giODpzsGQSRDu5yZ4YfNfKAQfqUm9yUDI,11698
|
|
60
60
|
nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=WysjDZeegclO3mZgVcGOwzWbr8wSI4pWRiYD4iC2EXo,7098
|
|
61
|
-
nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=
|
|
62
|
-
nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=
|
|
61
|
+
nv_ingest_api/internal/primitives/nim/model_interface/ocr.py,sha256=WuX-veTC510TbvMWYGYP6WCzjYCbUBAUc5ovJUWCrFU,29607
|
|
62
|
+
nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=6M3yxZHEyRX5sCDz-82SIPKw00TabYWxx-ZUEvGKrQQ,12920
|
|
63
63
|
nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py,sha256=lFhppNqrq5X_fzbCWKphvZQMzaJd3gHrkWsyJORzFrU,5010
|
|
64
64
|
nv_ingest_api/internal/primitives/nim/model_interface/vlm.py,sha256=sUDKKlVqKjiHEGr2D04I7S4pDfnLR8b-NplV1pf5GVQ,6240
|
|
65
65
|
nv_ingest_api/internal/primitives/nim/model_interface/yolox.py,sha256=zpfEZIPctWhNfREnP6e77zffU8vs_RfnMprBj-2jXXk,42847
|
|
@@ -68,24 +68,25 @@ nv_ingest_api/internal/primitives/tracing/latency.py,sha256=5kVTeYRbRdTlT_aI4MeS
|
|
|
68
68
|
nv_ingest_api/internal/primitives/tracing/logging.py,sha256=SSzIgS7afLH-e1C7VagYDmkkA6rTXmQ-bmtLjoEguhg,3851
|
|
69
69
|
nv_ingest_api/internal/primitives/tracing/tagging.py,sha256=xU534rb94uKnsSu0_DzyZcCSkIpa5SWTMxX7NSA3HoE,11671
|
|
70
70
|
nv_ingest_api/internal/schemas/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
71
|
+
nv_ingest_api/internal/schemas/mixins.py,sha256=QyAhZATodR0qWgcq6HxO3yTd31X1nxZfxvpNrXfaY9w,1236
|
|
71
72
|
nv_ingest_api/internal/schemas/extract/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
72
|
-
nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=
|
|
73
|
-
nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=
|
|
74
|
-
nv_ingest_api/internal/schemas/extract/extract_docx_schema.py,sha256=
|
|
73
|
+
nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=fl08h6XBjKRtm1LJlTJpH3s_LGoaKmuq8duBmHrXzXc,3974
|
|
74
|
+
nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=24zXz08DD-OXiOhjZPfv9fxIki8-c50xt3kH5JemE8I,4787
|
|
75
|
+
nv_ingest_api/internal/schemas/extract/extract_docx_schema.py,sha256=qSHCIFWx6w5NTguWrgTXRD3i-H2nRMxjoNTw2IsF5Zg,3896
|
|
75
76
|
nv_ingest_api/internal/schemas/extract/extract_html_schema.py,sha256=lazpONTGZ6Fl420BGBAr6rogFGtlzBiZTc1uA694OIs,841
|
|
76
|
-
nv_ingest_api/internal/schemas/extract/extract_image_schema.py,sha256=
|
|
77
|
-
nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py,sha256=
|
|
78
|
-
nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py,sha256=
|
|
79
|
-
nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py,sha256=
|
|
80
|
-
nv_ingest_api/internal/schemas/extract/extract_table_schema.py,sha256=
|
|
77
|
+
nv_ingest_api/internal/schemas/extract/extract_image_schema.py,sha256=NJAg1m-CPHAs4BMW8P09qL9sxS3ARBZOMpeNewFc5I8,3901
|
|
78
|
+
nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py,sha256=e65tQs6zSo7P7Wl9KZJznomvBrS1PmLX9f0IjytszO4,4477
|
|
79
|
+
nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py,sha256=J7J-rO6RO31777m9RKSdcOZhIwgIS7P9Y4M6WBaUYWs,6778
|
|
80
|
+
nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py,sha256=vuHK2qf-I41iDgnuXc08dol45LR5qcb0lYi1BQbZa74,3896
|
|
81
|
+
nv_ingest_api/internal/schemas/extract/extract_table_schema.py,sha256=ZMH0aR4fl4BMP5ZJq_J6R9Bq5VuHPxaS76bm9cMZn84,4438
|
|
81
82
|
nv_ingest_api/internal/schemas/message_brokers/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
|
|
82
83
|
nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py,sha256=4xTSFE_vH7yZE9RRJRflFAG9hNXIaF6K020M_xA7ylw,1351
|
|
83
84
|
nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDxTamVFqTQs2Yd8uvWyPE5mddHAWSU4PtfEIQ,966
|
|
84
85
|
nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
|
|
85
86
|
nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
86
87
|
nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
|
|
87
|
-
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=
|
|
88
|
-
nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=
|
|
88
|
+
nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=auvKHFJm9FquYRS6Ro7GawvgNhszT-1uG3ADMy4E_B8,12240
|
|
89
|
+
nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=nHS2PwYE7YwuTUotvUd0hP8a-5f9uefy6_G3mMH4UyQ,12321
|
|
89
90
|
nv_ingest_api/internal/schemas/meta/udf.py,sha256=GgzqbZOlipQgMpDhbXLqbF8xrHenj_hMNqhR_P-1ynw,779
|
|
90
91
|
nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
91
92
|
nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395
|
|
@@ -95,14 +96,14 @@ nv_ingest_api/internal/schemas/store/store_image_schema.py,sha256=p2LGij9i6sG6RY
|
|
|
95
96
|
nv_ingest_api/internal/schemas/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
96
97
|
nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py,sha256=fRMRwcWP-L8sfv2enNDt_W_CL0eC2i3b_1VCCtmr1K8,1188
|
|
97
98
|
nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py,sha256=31ThI5fr0yyENeJeE1xMAA-pxk1QVJLwM842zMate_k,429
|
|
98
|
-
nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=
|
|
99
|
+
nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py,sha256=gWED1Q861onhSmDIESX1ZG1BCyelhzNTepyb5ZZuFXc,1738
|
|
99
100
|
nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py,sha256=D9K8tvu-tkEBQkZo7uuRzgrHdGyM3ZcNycHbHy5HV2E,791
|
|
100
101
|
nv_ingest_api/internal/store/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
101
102
|
nv_ingest_api/internal/store/embed_text_upload.py,sha256=maxb4FPsBvWgvlrjAPEBlRZEFdJX5NxPG-p8kUbzV7I,9898
|
|
102
103
|
nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9PI25bkBn6Xn9h3I,9654
|
|
103
104
|
nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
104
105
|
nv_ingest_api/internal/transform/caption_image.py,sha256=0ILCG2F8ESqKtZiPUM-6F1BHUflFZ76Dzi2GNzkE-lU,8517
|
|
105
|
-
nv_ingest_api/internal/transform/embed_text.py,sha256=
|
|
106
|
+
nv_ingest_api/internal/transform/embed_text.py,sha256=F3G1zVFDJMYYZkly7bb6w5bgbUed9sv5sDd02JOF3no,23163
|
|
106
107
|
nv_ingest_api/internal/transform/split_text.py,sha256=LAtInGVuydH43UwjNMQWFVC1A6NdhXP_dZup2xX4qEo,7745
|
|
107
108
|
nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
108
109
|
nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -115,7 +116,7 @@ nv_ingest_api/util/converters/dftools.py,sha256=FjHjazIeiUd1LdFwWuummJmraqZe1a90
|
|
|
115
116
|
nv_ingest_api/util/converters/formats.py,sha256=L11FtormO2SeHSebbwsGE_uuCv6Jk0D3VvVW2avU0vI,2258
|
|
116
117
|
nv_ingest_api/util/converters/type_mappings.py,sha256=5TVXRyU6BlQvFOdqknEuQw3ss4PXeCvSUynJnjvgQpA,1102
|
|
117
118
|
nv_ingest_api/util/dataloader/__init__.py,sha256=B6ybDORMI9IzXGdhM7w_agcVj1BNYgAlcfTA0lG5jng,308
|
|
118
|
-
nv_ingest_api/util/dataloader/dataloader.py,sha256=
|
|
119
|
+
nv_ingest_api/util/dataloader/dataloader.py,sha256=1SG0cHKo7X_eBRTVMJ9EFJOfpqe37QqfEoMxeAWxkEU,15124
|
|
119
120
|
nv_ingest_api/util/detectors/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
|
|
120
121
|
nv_ingest_api/util/detectors/language.py,sha256=TvzcESYY0bn0U4aLY6GjB4VaCWA6XrXxAGZbVzHTMuE,965
|
|
121
122
|
nv_ingest_api/util/exception_handlers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -128,7 +129,7 @@ nv_ingest_api/util/image_processing/__init__.py,sha256=Jiy8C1ZuSrNb_eBM1ZTV9IKFI
|
|
|
128
129
|
nv_ingest_api/util/image_processing/clustering.py,sha256=sUGlZI4cx1q8h4Pns1N9JVpdfSM2BOH8zRmn9QFCtzI,9236
|
|
129
130
|
nv_ingest_api/util/image_processing/processing.py,sha256=LSoDDEmahr7a-qSS12McVcowRe3dOrAZwa1h-PD_JPQ,6554
|
|
130
131
|
nv_ingest_api/util/image_processing/table_and_chart.py,sha256=idCIjiLkY-usI2EARchg3omWLtIYmYA-1tdUUV2lbno,16338
|
|
131
|
-
nv_ingest_api/util/image_processing/transforms.py,sha256=
|
|
132
|
+
nv_ingest_api/util/image_processing/transforms.py,sha256=Mj0ry3DzCKY83ZNfvNuAIQBSkRvsxHPe0VAHRJb2BfA,30136
|
|
132
133
|
nv_ingest_api/util/imports/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
133
134
|
nv_ingest_api/util/imports/callable_signatures.py,sha256=ipzXNZJpfu7oeTBrQz2h6zrFVIQaqb2KBpzSuIX3u-Y,4138
|
|
134
135
|
nv_ingest_api/util/imports/dynamic_resolvers.py,sha256=qy7RpmBZrXJarOQl3J7jiCKnbZMNChXTL_Z-H4c9zlc,6170
|
|
@@ -139,14 +140,15 @@ nv_ingest_api/util/logging/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
|
|
|
139
140
|
nv_ingest_api/util/logging/configuration.py,sha256=05KR3LOS-PCqU-Io__iiKG_Ds730eKxciklFfNeId3w,3126
|
|
140
141
|
nv_ingest_api/util/logging/sanitize.py,sha256=-dIbmvLTevrTRd18QKUQQMV4hBk6pStWP_7_VtDDctg,2584
|
|
141
142
|
nv_ingest_api/util/message_brokers/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
143
|
+
nv_ingest_api/util/message_brokers/qos_scheduler.py,sha256=TdpjRyUfqR9y1v9SNxZaIN9ZgxVMlEGvLFhfUD7jjO8,10339
|
|
142
144
|
nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py,sha256=WaQ3CWIpIKWEivT5kL-bkmzcSQKLGFNFHdXHUJjqZFs,325
|
|
143
145
|
nv_ingest_api/util/message_brokers/simple_message_broker/broker.py,sha256=PekxaxVcAa9k1wgUtozlr04SW3sAeqYJE-wdVBZf9eo,17264
|
|
144
146
|
nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py,sha256=3p-LRqG8qLnsfEhBNf73_DG22C08JKahTqUvPLS2Apg,2554
|
|
145
|
-
nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py,sha256=
|
|
147
|
+
nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py,sha256=BFuegFsbU_YB_98gzhs8oU2by4_iVzIDanT0nJdjJ7g,16517
|
|
146
148
|
nv_ingest_api/util/metadata/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
|
|
147
149
|
nv_ingest_api/util/metadata/aggregators.py,sha256=YYdvJ1E04eGFZKKHUxXoH6mzLg8nor9Smvnv0qzqK5w,15988
|
|
148
150
|
nv_ingest_api/util/multi_processing/__init__.py,sha256=4fojP8Rp_5Hu1YAkqGylqTyEZ-HBVVEunn5Z9I99swA,242
|
|
149
|
-
nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=
|
|
151
|
+
nv_ingest_api/util/multi_processing/mp_pool_singleton.py,sha256=34O7I8Lin5GvO_zNZGbsqEGkDvIbqy_0Eh3ejoPNDVE,7501
|
|
150
152
|
nv_ingest_api/util/nim/__init__.py,sha256=No45pMstom1Jo0EENT6VEFkZn3YmTha7lYaBZU7xtHk,2116
|
|
151
153
|
nv_ingest_api/util/pdf/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
|
|
152
154
|
nv_ingest_api/util/pdf/pdfium.py,sha256=1aPCnPKXHWnncYoMO8HllYjrhODSXIeRBIsSLDevpYs,15667
|
|
@@ -156,18 +158,18 @@ nv_ingest_api/util/service_clients/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusX
|
|
|
156
158
|
nv_ingest_api/util/service_clients/client_base.py,sha256=eCOeq3Rr6Xnnsh-oHszYlQTOffQyzsT8s43V4V8H_h8,2716
|
|
157
159
|
nv_ingest_api/util/service_clients/kafka/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
|
|
158
160
|
nv_ingest_api/util/service_clients/redis/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
159
|
-
nv_ingest_api/util/service_clients/redis/redis_client.py,sha256=
|
|
161
|
+
nv_ingest_api/util/service_clients/redis/redis_client.py,sha256=b7rqJKYW27lmuSjTTho1sO2-q093cfeXARx8JgCHZ-o,44042
|
|
160
162
|
nv_ingest_api/util/service_clients/rest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
161
|
-
nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=
|
|
163
|
+
nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=7ymPxhuN9SP8nPSVepqqbvUxXPaTVunq2aC2bDbg98g,23684
|
|
162
164
|
nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
|
|
163
165
|
nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jfXRo9_M6hCZ59OxKLxG_47HRY,29888
|
|
164
|
-
nv_ingest_api/util/string_processing/yaml.py,sha256=
|
|
166
|
+
nv_ingest_api/util/string_processing/yaml.py,sha256=4Zdmc4474lUZn6kznqaNTlQJwsmRnnJQZ-DvAWLu-zo,2678
|
|
165
167
|
nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
166
168
|
nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
|
|
167
|
-
nv_ingest_api-2025.
|
|
169
|
+
nv_ingest_api-2025.11.2.dev20251102.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
168
170
|
udfs/__init__.py,sha256=pXFqPgXIUqHDfj7SAR1Q19tt8KwGv_iMvhHyziz4AYM,205
|
|
169
|
-
udfs/llm_summarizer_udf.py,sha256=
|
|
170
|
-
nv_ingest_api-2025.
|
|
171
|
-
nv_ingest_api-2025.
|
|
172
|
-
nv_ingest_api-2025.
|
|
173
|
-
nv_ingest_api-2025.
|
|
171
|
+
udfs/llm_summarizer_udf.py,sha256=lH5c5NHoT-5ecHC3og_40u1Ujta8SpsKU4X0e4wzbMU,7314
|
|
172
|
+
nv_ingest_api-2025.11.2.dev20251102.dist-info/METADATA,sha256=tE2TOo_c9GVqOG_deBDe46ps5gTi2Mr5_l1FupFhG2I,14105
|
|
173
|
+
nv_ingest_api-2025.11.2.dev20251102.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
174
|
+
nv_ingest_api-2025.11.2.dev20251102.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
|
|
175
|
+
nv_ingest_api-2025.11.2.dev20251102.dist-info/RECORD,,
|
udfs/llm_summarizer_udf.py
CHANGED
|
@@ -2,22 +2,39 @@
|
|
|
2
2
|
"""
|
|
3
3
|
LLM Content Summarizer UDF for NV-Ingest Pipeline
|
|
4
4
|
|
|
5
|
-
This UDF
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
Generates document summaries using NVIDIA-hosted LLMs. This production UDF demonstrates how to extract the pipeline
|
|
6
|
+
payload, run custom code (summarization), and inject results into the metadata for downstream usecases (such as
|
|
7
|
+
retrieval).
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
|
|
10
10
|
- NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
|
|
11
11
|
- LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
12
|
+
- LLM_BASE_URL: base URL (default: https://integrate.api.nvidia.com/v1)
|
|
13
|
+
- TIMEOUT: API timeout in seconds (default: 60)
|
|
14
|
+
- MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
|
|
15
|
+
- MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
|
|
16
|
+
TODO: Implement this
|
|
17
|
+
- NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
|
|
18
|
+
|
|
19
|
+
More info can be found in `examples/udfs/README.md`
|
|
16
20
|
"""
|
|
17
21
|
|
|
18
|
-
import os
|
|
19
22
|
import logging
|
|
20
|
-
|
|
23
|
+
import os
|
|
24
|
+
import time
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
PROMPT = """
|
|
30
|
+
Here are the contents from the first and last page of a document. Focus on the main purpose, key topics,
|
|
31
|
+
and important details. Just return the summary as a paragraph. Do not add special characters for formatting.
|
|
32
|
+
This summary will be used for document search and understanding.
|
|
33
|
+
|
|
34
|
+
[CONTENT]
|
|
35
|
+
{content}
|
|
36
|
+
[END CONTENT]
|
|
37
|
+
"""
|
|
21
38
|
|
|
22
39
|
|
|
23
40
|
def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage": # noqa: F821
|
|
@@ -27,13 +44,6 @@ def content_summarizer(control_message: "IngestControlMessage") -> "IngestContro
|
|
|
27
44
|
This function processes text primitives and generates concise summaries using
|
|
28
45
|
an LLM API, storing the results in the metadata's custom_content field.
|
|
29
46
|
|
|
30
|
-
Features:
|
|
31
|
-
- Flexible content detection across multiple metadata locations
|
|
32
|
-
- Robust error handling with graceful fallbacks
|
|
33
|
-
- Comprehensive logging for monitoring and debugging
|
|
34
|
-
- Configurable content length thresholds
|
|
35
|
-
- Safe metadata manipulation preserving existing data
|
|
36
|
-
|
|
37
47
|
Parameters
|
|
38
48
|
----------
|
|
39
49
|
control_message : IngestControlMessage
|
|
@@ -44,167 +54,152 @@ def content_summarizer(control_message: "IngestControlMessage") -> "IngestContro
|
|
|
44
54
|
IngestControlMessage
|
|
45
55
|
The modified control message with LLM summaries added to metadata
|
|
46
56
|
"""
|
|
47
|
-
from openai import OpenAI
|
|
48
|
-
|
|
49
|
-
logger = logging.getLogger(__name__)
|
|
50
57
|
logger.info("UDF: Starting LLM content summarization")
|
|
51
58
|
|
|
52
|
-
|
|
53
|
-
api_key = os.getenv("NVIDIA_API_KEY", "")
|
|
59
|
+
api_key = os.getenv("NVIDIA_API_KEY")
|
|
54
60
|
model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
|
|
55
61
|
base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
62
|
+
min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
|
|
63
|
+
max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
|
|
64
|
+
timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))
|
|
65
|
+
|
|
66
|
+
stats = {
|
|
67
|
+
"skipped": False,
|
|
68
|
+
"failed": False,
|
|
69
|
+
"tokens": 0,
|
|
70
|
+
"duration": 0.0,
|
|
71
|
+
}
|
|
59
72
|
|
|
60
73
|
if not api_key:
|
|
61
|
-
logger.
|
|
74
|
+
logger.error("NVIDIA_API_KEY not set. Skipping...")
|
|
62
75
|
return control_message
|
|
63
76
|
|
|
64
|
-
# Get the DataFrame payload
|
|
65
77
|
df = control_message.payload()
|
|
66
|
-
if df is None or len(df) == 0:
|
|
67
|
-
logger.warning("No payload found in control message")
|
|
68
|
-
return control_message
|
|
69
|
-
|
|
70
|
-
logger.info(f"Processing {len(df)} rows for LLM summarization")
|
|
71
78
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
|
|
75
|
-
except Exception as e:
|
|
76
|
-
logger.error(f"Failed to initialize OpenAI client: {e}")
|
|
79
|
+
if df is None or df.empty:
|
|
80
|
+
logger.warning("No payload found. Nothing to summarize.")
|
|
77
81
|
return control_message
|
|
78
82
|
|
|
79
|
-
#
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
#
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
# Truncate if needed
|
|
100
|
-
if len(content) > max_content_length:
|
|
101
|
-
content = content[:max_content_length]
|
|
102
|
-
|
|
103
|
-
# Generate summary
|
|
104
|
-
summary = _generate_summary(client, content, model_name, logger)
|
|
83
|
+
# Select first and last chunk for summarization
|
|
84
|
+
# According to docs/docs/extraction/user_defined_functions.md#understanding-the-dataframe-payload
|
|
85
|
+
# the rows are not necessarily pages. they are chunks of data extracted from the document. in order to select
|
|
86
|
+
# pages, it must require parsing the payload to see which chunks correspond to which pages
|
|
87
|
+
original_df = df.copy()
|
|
88
|
+
if len(df) > 1:
|
|
89
|
+
# TODO: add feature to select N first and last chunks
|
|
90
|
+
df = df.iloc[[0, -1]]
|
|
91
|
+
else:
|
|
92
|
+
logger.info("Document has only one chunk")
|
|
93
|
+
|
|
94
|
+
# Combine all content into a single string
|
|
95
|
+
content_list = df.apply(
|
|
96
|
+
_extract_content,
|
|
97
|
+
axis=1,
|
|
98
|
+
min_content_length=min_content_length,
|
|
99
|
+
max_content_length=max_content_length,
|
|
100
|
+
stats=stats,
|
|
101
|
+
)
|
|
102
|
+
content = " ".join(content_list)
|
|
105
103
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
_add_summary(df, idx, row, summary, model_name, logger)
|
|
109
|
-
stats["summarized"] += 1
|
|
110
|
-
else:
|
|
111
|
-
stats["failed"] += 1
|
|
104
|
+
# Nicely ask LLM to summarize content
|
|
105
|
+
summary, stats["duration"] = _generate_llm_summary(content, model_name, base_url, api_key, timeout)
|
|
112
106
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
107
|
+
stats["failed"] = summary is None
|
|
108
|
+
if not stats["failed"]:
|
|
109
|
+
stats["tokens"] = _estimate_tokens(content)
|
|
110
|
+
logger.info("Summarized %d tokens in %f seconds using %s", stats["tokens"], stats["duration"], model_name)
|
|
111
|
+
_store_summary(original_df, summary, model_name)
|
|
116
112
|
|
|
117
|
-
|
|
118
|
-
|
|
113
|
+
# Update the control message with modified DataFrame
|
|
114
|
+
control_message.payload(original_df)
|
|
119
115
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
f"{stats['skipped']} skipped, {stats['failed']} failed"
|
|
123
|
-
)
|
|
116
|
+
else:
|
|
117
|
+
logger.warning("%s failed to summarize content", model_name)
|
|
124
118
|
|
|
125
119
|
return control_message
|
|
126
120
|
|
|
127
121
|
|
|
128
|
-
def _extract_content(row,
|
|
129
|
-
"""Extract text content from row
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
# Try different locations for content
|
|
133
|
-
if isinstance(row.get("metadata"), dict):
|
|
134
|
-
metadata = row["metadata"]
|
|
135
|
-
|
|
136
|
-
# Primary location: metadata.content
|
|
137
|
-
content = metadata.get("content", "")
|
|
122
|
+
def _extract_content(row, stats: dict, min_content_length: int = 50, max_content_length: int = 12000) -> str | None:
|
|
123
|
+
"""Extract text content from row"""
|
|
124
|
+
metadata = row.get("metadata")
|
|
138
125
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
126
|
+
if isinstance(metadata, dict):
|
|
127
|
+
content = metadata.get("content")
|
|
128
|
+
if content is not None:
|
|
129
|
+
content = content.strip()
|
|
130
|
+
if len(content) < min_content_length:
|
|
131
|
+
stats["skipped"] = True
|
|
132
|
+
logger.warning(f"Content less than min={min_content_length}. Skipping...")
|
|
133
|
+
content = ""
|
|
134
|
+
elif len(content) > max_content_length:
|
|
135
|
+
logger.warning(f"Truncating content to {max_content_length} characters")
|
|
136
|
+
content = content[:max_content_length]
|
|
137
|
+
else:
|
|
138
|
+
stats["skipped"] = True
|
|
139
|
+
content = ""
|
|
148
140
|
|
|
149
|
-
|
|
150
|
-
|
|
141
|
+
else:
|
|
142
|
+
stats["skipped"] = True
|
|
143
|
+
logger.warning("No metadata found. Skipping...")
|
|
144
|
+
content = ""
|
|
151
145
|
|
|
152
146
|
return content
|
|
153
147
|
|
|
154
148
|
|
|
155
|
-
def
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
Summary:"""
|
|
149
|
+
def _generate_llm_summary(
|
|
150
|
+
content: str,
|
|
151
|
+
model_name: str,
|
|
152
|
+
base_url: str,
|
|
153
|
+
api_key: str,
|
|
154
|
+
timeout: int,
|
|
155
|
+
) -> tuple[str | None, float]:
|
|
156
|
+
"""Ask an LLM to summarize content extracted from doc."""
|
|
165
157
|
|
|
158
|
+
start_time = time.time()
|
|
166
159
|
try:
|
|
160
|
+
from openai import OpenAI
|
|
161
|
+
|
|
162
|
+
client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
|
|
163
|
+
start_time = time.time()
|
|
167
164
|
completion = client.chat.completions.create(
|
|
168
165
|
model=model_name,
|
|
169
|
-
messages=[{"role": "user", "content":
|
|
166
|
+
messages=[{"role": "user", "content": PROMPT.format(content=content)}],
|
|
170
167
|
max_tokens=400, # Increased for more comprehensive summaries
|
|
171
168
|
temperature=0.7,
|
|
172
169
|
)
|
|
170
|
+
duration = time.time() - start_time
|
|
173
171
|
|
|
174
|
-
if completion.choices
|
|
172
|
+
if completion.choices:
|
|
175
173
|
summary = completion.choices[0].message.content.strip()
|
|
176
|
-
return summary
|
|
177
|
-
|
|
178
|
-
return None
|
|
174
|
+
return summary, duration
|
|
175
|
+
return None, duration
|
|
179
176
|
|
|
180
177
|
except Exception as e:
|
|
181
178
|
logger.error(f"API call failed: {e}")
|
|
182
|
-
|
|
179
|
+
# TODO: GitHub Thread
|
|
180
|
+
# Reviewers, tell me if this is a bad idea.
|
|
181
|
+
# I think the convention is to return timestamp for time even if it fails
|
|
182
|
+
return None, time.time() - start_time
|
|
183
183
|
|
|
184
184
|
|
|
185
|
-
def
|
|
186
|
-
"""Add summary to metadata
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
existing_metadata = row.get("metadata")
|
|
190
|
-
if existing_metadata is not None and isinstance(existing_metadata, dict):
|
|
191
|
-
metadata = dict(existing_metadata) # Create a copy
|
|
192
|
-
else:
|
|
193
|
-
metadata = {}
|
|
185
|
+
def _store_summary(df, summary: str, model_name: str):
|
|
186
|
+
"""Add summary to metadata and store in df"""
|
|
187
|
+
# hardcoded heuristic to store everything on chunk 0's metadata
|
|
188
|
+
row_0 = df.iloc[0]
|
|
194
189
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
190
|
+
# this is a reference to a dictionary that is stored in the dataframe
|
|
191
|
+
# and is modified in place
|
|
192
|
+
metadata = row_0.get("metadata")
|
|
198
193
|
|
|
199
|
-
|
|
200
|
-
metadata["custom_content"]
|
|
194
|
+
if metadata.get("custom_content") is None:
|
|
195
|
+
metadata["custom_content"] = {}
|
|
196
|
+
metadata["custom_content"]["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
|
|
201
197
|
|
|
202
|
-
# Update the DataFrame at the specific index
|
|
203
|
-
try:
|
|
204
|
-
df.at[idx, "metadata"] = metadata
|
|
205
|
-
except Exception:
|
|
206
|
-
# Alternative approach: update the original row reference
|
|
207
|
-
df.iloc[idx]["metadata"] = metadata
|
|
208
198
|
|
|
209
|
-
|
|
210
|
-
|
|
199
|
+
def _estimate_tokens(text: str) -> int:
|
|
200
|
+
"""Rough estimate (~4 characters per token)"""
|
|
201
|
+
return len(text) // 4
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _safe_model_name(name: str) -> str:
|
|
205
|
+
return name.replace("/", "__").replace("-", "_")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|