nv-ingest-api 2025.10.8.dev20251008__tar.gz → 2025.10.9.dev20251009__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (181)
  1. {nv_ingest_api-2025.10.8.dev20251008/src/nv_ingest_api.egg-info → nv_ingest_api-2025.10.9.dev20251009}/PKG-INFO +1 -1
  2. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +6 -4
  3. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +44 -8
  4. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
  5. nv_ingest_api-2025.10.9.dev20251009/src/udfs/llm_summarizer_udf.py +204 -0
  6. nv_ingest_api-2025.10.8.dev20251008/src/udfs/llm_summarizer_udf.py +0 -210
  7. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/LICENSE +0 -0
  8. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/MANIFEST.in +0 -0
  9. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/README.md +0 -0
  10. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/pyproject.toml +0 -0
  11. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/setup.cfg +0 -0
  12. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/__init__.py +0 -0
  13. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/__init__.py +0 -0
  14. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/extract.py +0 -0
  15. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/mutate.py +0 -0
  16. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/store.py +0 -0
  17. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/transform.py +0 -0
  18. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/utility.py +0 -0
  19. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/__init__.py +0 -0
  20. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
  21. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/enums/common.py +0 -0
  22. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
  23. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
  24. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
  25. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
  26. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
  27. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  28. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
  29. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
  30. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
  31. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
  32. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
  33. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
  34. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
  35. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
  36. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
  37. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
  38. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
  39. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
  40. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
  41. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
  42. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
  43. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
  44. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
  45. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
  46. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
  47. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
  48. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
  49. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
  50. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  51. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
  52. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
  53. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/meta/__init__.py +0 -0
  54. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/meta/udf.py +0 -0
  55. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
  56. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
  57. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
  58. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  59. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
  60. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
  61. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
  62. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
  63. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
  64. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
  65. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
  66. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
  67. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
  68. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
  69. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +0 -0
  70. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
  71. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
  72. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
  73. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
  74. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
  75. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  76. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
  77. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
  78. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
  79. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
  80. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
  81. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
  82. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
  83. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
  84. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
  85. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
  86. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
  87. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
  88. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
  89. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
  90. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
  91. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
  92. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
  93. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
  94. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
  95. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
  96. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -0
  97. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
  98. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/udf.py +0 -0
  99. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
  100. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
  101. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
  102. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
  103. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
  104. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
  105. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
  106. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
  107. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
  108. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
  109. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/store/__init__.py +0 -0
  110. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
  111. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
  112. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
  113. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
  114. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
  115. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
  116. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/__init__.py +0 -0
  117. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  118. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/control_message/validators.py +0 -0
  119. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/__init__.py +0 -0
  120. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
  121. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/containers.py +0 -0
  122. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/datetools.py +0 -0
  123. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/dftools.py +0 -0
  124. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/formats.py +0 -0
  125. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
  126. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/dataloader/__init__.py +0 -0
  127. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/dataloader/dataloader.py +0 -0
  128. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
  129. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/detectors/language.py +0 -0
  130. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  131. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
  132. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
  133. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
  134. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
  135. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
  136. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
  137. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
  138. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
  139. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
  140. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
  141. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/imports/__init__.py +0 -0
  142. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/imports/callable_signatures.py +0 -0
  143. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/imports/dynamic_resolvers.py +0 -0
  144. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/introspection/__init__.py +0 -0
  145. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/introspection/class_inspect.py +0 -0
  146. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/introspection/function_inspect.py +0 -0
  147. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/logging/__init__.py +0 -0
  148. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/logging/configuration.py +0 -0
  149. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/logging/sanitize.py +0 -0
  150. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
  151. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
  152. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
  153. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
  154. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
  155. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
  156. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
  157. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
  158. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
  159. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/nim/__init__.py +0 -0
  160. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
  161. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
  162. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/schema/__init__.py +0 -0
  163. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
  164. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
  165. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
  166. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
  167. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  168. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
  169. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  170. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
  171. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
  172. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/string_processing/configuration.py +0 -0
  173. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/string_processing/yaml.py +0 -0
  174. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/system/__init__.py +0 -0
  175. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
  176. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api.egg-info/SOURCES.txt +0 -0
  177. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  178. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api.egg-info/requires.txt +0 -0
  179. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
  180. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/udfs/__init__.py +0 -0
  181. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.9.dev20251009}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.10.8.dev20251008
3
+ Version: 2025.10.9.dev20251009
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -332,6 +332,7 @@ def _extract_page_elements(
332
332
 
333
333
  # Process each extracted element based on extraction flags
334
334
  for page_idx, page_element in page_element_results:
335
+ page_reading_index = page_idx + 1
335
336
  # Skip elements that shouldn't be extracted based on flags
336
337
  if (not extract_tables) and (page_element.type_string == "table"):
337
338
  continue
@@ -347,7 +348,7 @@ def _extract_page_elements(
347
348
  # Construct metadata for the page element
348
349
  page_element_meta = construct_page_element_metadata(
349
350
  page_element,
350
- page_idx,
351
+ page_reading_index,
351
352
  page_count,
352
353
  source_metadata,
353
354
  base_unified_metadata,
@@ -473,6 +474,7 @@ def pdfium_extractor(
473
474
  for page_idx in range(page_count):
474
475
  page = doc.get_page(page_idx)
475
476
  page_width, page_height = page.get_size()
477
+ page_reading_index = page_idx + 1
476
478
 
477
479
  # Text extraction
478
480
  if extract_text:
@@ -481,7 +483,7 @@ def pdfium_extractor(
481
483
  text_meta = construct_text_metadata(
482
484
  [page_text],
483
485
  pdf_metadata.keywords,
484
- page_idx,
486
+ page_reading_index,
485
487
  -1,
486
488
  -1,
487
489
  -1,
@@ -499,7 +501,7 @@ def pdfium_extractor(
499
501
  image_data = _extract_page_images(
500
502
  extract_images_method,
501
503
  page,
502
- page_idx,
504
+ page_reading_index,
503
505
  page_width,
504
506
  page_height,
505
507
  page_count,
@@ -518,7 +520,7 @@ def pdfium_extractor(
518
520
  base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
519
521
  image_meta = construct_image_metadata_from_base64(
520
522
  base64_image,
521
- page_idx,
523
+ page_reading_index,
522
524
  page_count,
523
525
  source_metadata,
524
526
  base_unified_metadata,
@@ -326,16 +326,52 @@ class NimClient:
326
326
 
327
327
  outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]
328
328
 
329
- response = self.client.infer(
330
- model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
331
- )
329
+ base_delay = 0.5
330
+ attempt = 0
331
+ retries_429 = 0
332
+ max_grpc_retries = self.max_429_retries
332
333
 
333
- logger.debug(f"gRPC inference response: {response}")
334
+ while attempt < self.max_retries:
335
+ try:
336
+ response = self.client.infer(
337
+ model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
338
+ )
334
339
 
335
- if len(outputs) == 1:
336
- return response.as_numpy(outputs[0].name())
337
- else:
338
- return [response.as_numpy(output.name()) for output in outputs]
340
+ logger.debug(f"gRPC inference response: {response}")
341
+
342
+ if len(outputs) == 1:
343
+ return response.as_numpy(outputs[0].name())
344
+ else:
345
+ return [response.as_numpy(output.name()) for output in outputs]
346
+
347
+ except grpcclient.InferenceServerException as e:
348
+ status = e.status()
349
+ if status == "StatusCode.UNAVAILABLE" and "Exceeds maximum queue size".lower() in e.message().lower():
350
+ retries_429 += 1
351
+ logger.warning(
352
+ f"Received gRPC {status} for model '{model_name}'. "
353
+ f"Attempt {retries_429} of {max_grpc_retries}."
354
+ )
355
+ if retries_429 >= max_grpc_retries:
356
+ logger.error(f"Max retries for gRPC {status} exceeded for model '{model_name}'.")
357
+ raise
358
+
359
+ backoff_time = base_delay * (2**retries_429)
360
+ time.sleep(backoff_time)
361
+ continue
362
+
363
+ else:
364
+ # For other server-side errors (e.g., INVALID_ARGUMENT, NOT_FOUND),
365
+ # retrying will not help. We should fail fast.
366
+ logger.error(
367
+ f"Received non-retryable gRPC error from Triton for model '{model_name}': {e.message()}"
368
+ )
369
+ raise
370
+
371
+ except Exception as e:
372
+ # Catch any other unexpected exceptions (e.g., network issues not caught by Triton client)
373
+ logger.error(f"An unexpected error occurred during gRPC inference for model '{model_name}': {e}")
374
+ raise
339
375
 
340
376
  def _http_infer(self, formatted_input: dict) -> dict:
341
377
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.10.8.dev20251008
3
+ Version: 2025.10.9.dev20251009
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -0,0 +1,204 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LLM Content Summarizer UDF for NV-Ingest Pipeline
4
+
5
+ This UDF uses an LLM to generate concise summaries of text content chunks. These summaries are added to the metadata
6
+ for enhanced downstream processing and search capabilities.
7
+
8
+ These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
9
+ - NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
10
+ - LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
11
+ - LLM_SUMMARIZATION_BASE_URL: API base URL (default: https://integrate.api.nvidia.com/v1)
12
+ - LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
13
+ - LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
14
+ - LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
15
+ TODO: Implement this
16
+ - NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
17
+ """
18
+
19
+ import logging
20
+ import os
21
+ import time
22
+
23
+ # REMOVE BEFORE MERGING
24
+ # import yaml
25
+ # from pathlib import Path
26
+
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ PROMPT = """
31
+ Here are the contents from the first and last page of a document. Focus on the main purpose, key topics,
32
+ and important details. Just return the summary as a paragraph. Do not add special characters for formatting.
33
+ This summary will be used for document search and understanding.
34
+
35
+ [CONTENT]
36
+ {content}
37
+ [END CONTENT]
38
+ """
39
+
40
+
41
def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
    """
    UDF function that adds an LLM-generated summary to a document's metadata.

    The text of the first and last chunk of the payload is combined, sent to an
    LLM, and the resulting summary is stored in the ``custom_content`` field of
    the first chunk's metadata. The payload keeps every row: only the rows used
    as LLM input are selected, the DataFrame written back is the full one.

    Configuration is read from the environment: ``NVIDIA_API_KEY`` (required),
    ``LLM_SUMMARIZATION_MODEL``, ``LLM_SUMMARIZATION_BASE_URL``,
    ``LLM_MIN_CONTENT_LENGTH``, ``LLM_MAX_CONTENT_LENGTH`` and
    ``LLM_SUMMARIZATION_TIMEOUT``.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message containing the DataFrame payload with text content

    Returns
    -------
    IngestControlMessage
        The same control message; its payload carries the summary when one
        could be generated and is left untouched otherwise
    """
    logger.info("UDF: Starting LLM content summarization")

    api_key = os.getenv("NVIDIA_API_KEY")
    model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
    base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
    min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
    max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
    timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))

    stats = {
        "skipped": False,
        "failed": False,
        "tokens": 0,
        "duration": 0.0,
    }

    if not api_key:
        logger.error("NVIDIA_API_KEY not set. Skipping...")
        return control_message

    df = control_message.payload()

    if df is None or df.empty:
        logger.warning("No payload found. Nothing to summarize.")
        return control_message

    # Select first and last chunk for summarization.
    # Rows are chunks of extracted data, not necessarily pages; selecting by page
    # would require parsing the payload to map chunks to pages.
    # The selection is kept in its own variable so that `df` (the full payload)
    # is what gets written back below -- rebinding `df` here would drop every
    # middle chunk from the control message.
    if len(df) > 1:
        # TODO: add feature to select N first and last chunks
        selected = df.iloc[[0, -1]]
    else:
        logger.info("Document has only one chunk")
        selected = df

    # Combine the usable content of the selected chunks into a single string
    content_list = selected.apply(
        _extract_content,
        axis=1,
        min_content_length=min_content_length,
        max_content_length=max_content_length,
        stats=stats,
    )
    content = " ".join(part for part in content_list if part)

    if not content:
        # Every selected chunk was skipped; asking the LLM to summarize nothing
        # would only produce a fabricated summary.
        stats["skipped"] = True
        logger.warning("No usable content found. Nothing to summarize.")
        return control_message

    summary, stats["duration"] = _generate_llm_summary(content, model_name, base_url, api_key, timeout)

    stats["failed"] = summary is None
    if stats["failed"]:
        logger.warning("%s failed to summarize content", model_name)
        return control_message

    stats["tokens"] = _estimate_tokens(content)
    logger.info("Summarized %d tokens in %f seconds using %s", stats["tokens"], stats["duration"], model_name)

    # Row 0 of the full payload is the same chunk as row 0 of the selection
    _store_summary(df, summary, model_name)

    # Update the control message with the (complete) modified DataFrame
    control_message.payload(df)

    return control_message
119
+
120
+
121
+ def _extract_content(row, stats: dict, min_content_length: int = 50, max_content_length: int = 12000) -> str | None:
122
+ """Extract text content from row"""
123
+ metadata = row.get("metadata")
124
+
125
+ if isinstance(metadata, dict):
126
+ content = metadata.get("content")
127
+ if content is not None:
128
+ content = content.strip()
129
+ if len(content) < min_content_length:
130
+ stats["skipped"] = True
131
+ logger.warning(f"Content less than min={min_content_length}. Skipping...")
132
+ content = ""
133
+ elif len(content) > max_content_length:
134
+ logger.warning(f"Truncating content to {max_content_length} characters")
135
+ content = content[:max_content_length]
136
+ else:
137
+ stats["skipped"] = True
138
+ content = ""
139
+
140
+ else:
141
+ stats["skipped"] = True
142
+ logger.warning("No metadata found. Skipping...")
143
+ content = ""
144
+
145
+ return content
146
+
147
+
148
def _generate_llm_summary(
    content: str,
    model_name: str,
    base_url: str,
    api_key: str,
    timeout: int,
) -> tuple[str | None, float]:
    """
    Ask an LLM to summarize content extracted from a document.

    Parameters
    ----------
    content : str
        Text inserted into the module-level ``PROMPT`` template
    model_name : str
        Model identifier passed to the chat completions API
    base_url : str
        Base URL of the OpenAI-compatible endpoint
    api_key : str
        API key used to authenticate
    timeout : int
        Client timeout passed to the OpenAI client

    Returns
    -------
    tuple[str | None, float]
        ``(summary, duration_seconds)``. ``summary`` is None when the call
        fails, returns no choices, or returns empty/non-text content. The
        duration is reported even on failure.
    """
    # perf_counter is monotonic, so the measured duration cannot go negative or
    # jump if the system clock is adjusted mid-request.
    start_time = time.perf_counter()
    try:
        from openai import OpenAI

        client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
        # Restart the clock so a successful call reports the request time only;
        # the earlier reading still covers failures during import/client setup.
        start_time = time.perf_counter()
        completion = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": PROMPT.format(content=content)}],
            max_tokens=400,  # Increased for more comprehensive summaries
            temperature=0.7,
        )
        duration = time.perf_counter() - start_time

        if not completion.choices:
            return None, duration

        # The message content may be absent; handle that explicitly instead of
        # letting `.strip()` raise and be reported as an API failure.
        text = completion.choices[0].message.content
        summary = text.strip() if isinstance(text, str) else ""
        if not summary:
            logger.warning("LLM returned an empty summary")
            return None, duration
        return summary, duration

    except Exception as e:
        logger.error(f"API call failed: {e}")
        # Report elapsed time even when the call fails
        return None, time.perf_counter() - start_time
182
+
183
+
184
+ def _store_summary(df, summary: str, model_name: str):
185
+ """Add summary to metadata and store in df"""
186
+ # hardcoded heuristic to store everything on chunk 0's metadata
187
+ row_0 = df.iloc[0]
188
+
189
+ # this is a reference to a dictionary that is stored in the dataframe
190
+ # and is modified in place
191
+ metadata = row_0.get("metadata")
192
+
193
+ if metadata.get("custom_content") is None:
194
+ metadata["custom_content"] = {}
195
+ metadata["custom_content"]["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
196
+
197
+
198
+ def _estimate_tokens(text: str) -> int:
199
+ """Rough estimate (~4 characters per token)"""
200
+ return len(text) // 4
201
+
202
+
203
+ def _safe_model_name(name: str) -> str:
204
+ return name.replace("/", "__").replace("-", "_")
@@ -1,210 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- LLM Content Summarizer UDF for NV-Ingest Pipeline
4
-
5
- This UDF uses an LLM API to generate concise summaries
6
- of text content chunks, adding AI-generated summaries to the metadata for
7
- enhanced downstream processing and search capabilities.
8
-
9
- Environment Variables:
10
- - NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
11
- - LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
12
- - LLM_SUMMARIZATION_BASE_URL: API base URL (default: https://integrate.api.nvidia.com/v1)
13
- - LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
14
- - LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
15
- - LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
16
- """
17
-
18
- import os
19
- import logging
20
- from typing import Optional
21
-
22
-
23
def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
    """
    UDF function that adds LLM-generated summaries to text content chunks.

    This function processes text primitives and generates concise summaries using
    an LLM API, storing the results in the metadata's custom_content field.
    Each row of the payload is summarized independently; rows without content,
    or with content shorter than the configured minimum, are skipped.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message containing the DataFrame payload with text content

    Returns
    -------
    IngestControlMessage
        The modified control message with LLM summaries added to metadata.
        It is returned unchanged when the API key is missing, the payload is
        empty, or the OpenAI client cannot be created.
    """
    logger = logging.getLogger(__name__)
    logger.info("UDF: Starting LLM content summarization")

    # Get configuration from environment
    api_key = os.getenv("NVIDIA_API_KEY", "")
    model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
    base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
    timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", "60"))
    min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", "50"))
    max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", "12000"))

    if not api_key:
        logger.warning("NVIDIA_API_KEY not found, skipping summarization")
        return control_message

    # Get the DataFrame payload
    df = control_message.payload()
    if df is None or len(df) == 0:
        logger.warning("No payload found in control message")
        return control_message

    logger.info(f"Processing {len(df)} rows for LLM summarization")

    # Import and initialize the OpenAI client together, after the cheap guards:
    # a missing `openai` package is then logged and skipped like any other
    # initialization failure instead of raising before the guards run.
    try:
        from openai import OpenAI

        client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
    except Exception as e:
        logger.error(f"Failed to initialize OpenAI client: {e}")
        return control_message

    # Stats for reporting
    stats = {"processed": 0, "summarized": 0, "skipped": 0, "failed": 0}

    # Process each row; a failure on one row must not abort the others
    for idx, row in df.iterrows():
        stats["processed"] += 1

        try:
            # Extract content - be more flexible about where it comes from
            content = _extract_content(row, logger)

            if not content:
                stats["skipped"] += 1
                continue

            content = content.strip()
            if len(content) < min_content_length:
                stats["skipped"] += 1
                continue

            # Truncate if needed
            if len(content) > max_content_length:
                content = content[:max_content_length]

            # Generate summary
            summary = _generate_summary(client, content, model_name, logger)

            if summary:
                # Add to metadata
                _add_summary(df, idx, row, summary, model_name, logger)
                stats["summarized"] += 1
            else:
                stats["failed"] += 1

        except Exception as e:
            stats["failed"] += 1
            logger.error(f"Row {idx}: Error processing content: {e}")

    # Update the control message with modified DataFrame
    control_message.payload(df)

    logger.info(
        f"LLM summarization complete: {stats['summarized']}/{stats['processed']} documents summarized, "
        f"{stats['skipped']} skipped, {stats['failed']} failed"
    )

    return control_message
126
-
127
-
128
- def _extract_content(row, logger) -> Optional[str]:
129
- """Extract text content from row, trying multiple locations."""
130
- content = ""
131
-
132
- # Try different locations for content
133
- if isinstance(row.get("metadata"), dict):
134
- metadata = row["metadata"]
135
-
136
- # Primary location: metadata.content
137
- content = metadata.get("content", "")
138
-
139
- # If no content, try other locations
140
- if not content:
141
- # Try in text_metadata
142
- text_metadata = metadata.get("text_metadata", {})
143
- content = text_metadata.get("text", "") or text_metadata.get("content", "")
144
-
145
- # Try top-level content field
146
- if not content:
147
- content = row.get("content", "")
148
-
149
- if not content:
150
- return None
151
-
152
- return content
153
-
154
-
155
- def _generate_summary(client, content: str, model_name: str, logger) -> Optional[str]:
156
- """Generate summary with robust error handling."""
157
- prompt = f"""Please provide a comprehensive 3-4 sentence summary of the following document:
158
-
159
- {content}
160
-
161
- Focus on the main purpose, key topics, and important details.
162
- This summary will be used for document search and understanding.
163
-
164
- Summary:"""
165
-
166
- try:
167
- completion = client.chat.completions.create(
168
- model=model_name,
169
- messages=[{"role": "user", "content": prompt}],
170
- max_tokens=400, # Increased for more comprehensive summaries
171
- temperature=0.7,
172
- )
173
-
174
- if completion.choices and len(completion.choices) > 0:
175
- summary = completion.choices[0].message.content.strip()
176
- return summary
177
- else:
178
- return None
179
-
180
- except Exception as e:
181
- logger.error(f"API call failed: {e}")
182
- return None
183
-
184
-
185
- def _add_summary(df, idx: int, row, summary: str, model_name: str, logger):
186
- """Add summary to metadata with safe handling."""
187
- try:
188
- # Get current metadata or create new dict - handle None case properly
189
- existing_metadata = row.get("metadata")
190
- if existing_metadata is not None and isinstance(existing_metadata, dict):
191
- metadata = dict(existing_metadata) # Create a copy
192
- else:
193
- metadata = {}
194
-
195
- # Ensure custom_content exists
196
- if "custom_content" not in metadata or metadata["custom_content"] is None:
197
- metadata["custom_content"] = {}
198
-
199
- # Add LLM summary
200
- metadata["custom_content"]["llm_summary"] = {"summary": summary, "model": model_name}
201
-
202
- # Update the DataFrame at the specific index
203
- try:
204
- df.at[idx, "metadata"] = metadata
205
- except Exception:
206
- # Alternative approach: update the original row reference
207
- df.iloc[idx]["metadata"] = metadata
208
-
209
- except Exception as e:
210
- logger.error(f"Failed to add summary to row {idx}: {e}")