nv-ingest-api 2025.5.17.dev20250517__tar.gz → 2025.5.19.dev20250519__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. {nv_ingest_api-2025.5.17.dev20250517/src/nv_ingest_api.egg-info → nv_ingest_api-2025.5.19.dev20250519}/PKG-INFO +1 -1
  2. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +142 -86
  3. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +170 -171
  4. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/transform/split_text.py +9 -3
  5. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
  6. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/LICENSE +0 -0
  7. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/MANIFEST.in +0 -0
  8. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/README.md +0 -0
  9. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/pyproject.toml +0 -0
  10. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/setup.cfg +0 -0
  11. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/__init__.py +0 -0
  12. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/interface/__init__.py +0 -0
  13. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/interface/extract.py +0 -0
  14. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/interface/mutate.py +0 -0
  15. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/interface/store.py +0 -0
  16. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/interface/transform.py +0 -0
  17. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/interface/utility.py +0 -0
  18. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/__init__.py +0 -0
  19. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
  20. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/enums/common.py +0 -0
  21. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
  22. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
  23. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
  24. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
  25. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
  26. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  27. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
  28. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
  29. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
  30. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
  31. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
  32. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
  33. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
  34. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
  35. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
  36. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
  37. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
  38. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
  39. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
  40. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
  41. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
  42. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -0
  43. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
  44. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
  45. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
  46. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
  47. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  48. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
  49. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
  50. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
  51. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
  52. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  53. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
  54. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
  55. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
  56. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
  57. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
  58. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
  59. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
  60. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
  61. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
  62. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
  63. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -0
  64. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
  65. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
  66. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
  67. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
  68. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -0
  69. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
  70. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  71. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
  72. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
  73. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
  74. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
  75. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
  76. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
  77. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
  78. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
  79. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
  80. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
  81. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
  82. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
  83. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
  84. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
  85. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
  86. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
  87. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
  88. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
  89. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
  90. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -0
  91. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
  92. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
  93. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
  94. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
  95. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
  96. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
  97. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
  98. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
  99. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
  100. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
  101. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
  102. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/store/__init__.py +0 -0
  103. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
  104. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
  105. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
  106. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
  107. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
  108. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/__init__.py +0 -0
  109. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  110. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/control_message/validators.py +0 -0
  111. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/converters/__init__.py +0 -0
  112. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
  113. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/converters/containers.py +0 -0
  114. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/converters/datetools.py +0 -0
  115. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/converters/dftools.py +0 -0
  116. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/converters/formats.py +0 -0
  117. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
  118. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
  119. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/detectors/language.py +0 -0
  120. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  121. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
  122. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
  123. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
  124. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
  125. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
  126. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
  127. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
  128. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
  129. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
  130. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
  131. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/logging/__init__.py +0 -0
  132. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/logging/configuration.py +0 -0
  133. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
  134. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
  135. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
  136. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
  137. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
  138. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
  139. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
  140. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
  141. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
  142. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/nim/__init__.py +0 -0
  143. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
  144. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
  145. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/schema/__init__.py +0 -0
  146. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
  147. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
  148. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
  149. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
  150. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  151. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
  152. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  153. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
  154. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
  155. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/system/__init__.py +0 -0
  156. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
  157. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api.egg-info/SOURCES.txt +0 -0
  158. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  159. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api.egg-info/requires.txt +0 -0
  160. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
  161. {nv_ingest_api-2025.5.17.dev20250517 → nv_ingest_api-2025.5.19.dev20250519}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.5.17.dev20250517
3
+ Version: 2025.5.19.dev20250519
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -274,59 +274,70 @@ class DocxReader:
274
274
  - A list of extracted images from the paragraph.
275
275
  """
276
276
 
277
- paragraph_images = []
278
- if self.paragraph_format == "text":
279
- paragraph_text = paragraph.text
280
- else:
281
- # Get the default style of the paragraph, "markdown"
277
+ try:
278
+ paragraph_images = []
279
+ if self.paragraph_format == "text":
280
+ return paragraph.text.strip(), paragraph_images
281
+
282
282
  font = paragraph.style.font
283
283
  default_style = (font.bold, font.italic, font.underline)
284
284
 
285
- # Iterate over the runs of the paragraph and group them by style, excluding empty runs
286
285
  paragraph_text = ""
287
286
  group_text = ""
288
287
  previous_style = None
289
288
 
290
289
  for c in paragraph.iter_inner_content():
291
- if isinstance(c, Hyperlink):
292
- text = f"[{c.text}]({c.address})"
293
- style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
294
- elif isinstance(c, Run):
295
- text = c.text
296
- style = (c.bold, c.italic, c.underline)
297
- # 1. Locate the inline shape which is stored in the <w:drawing> element.
298
- # 2. r:embed in <a.blip> has the relationship id for extracting the file where
299
- # the image is stored as bytes.
300
- # Reference:
301
- # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
302
- inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
303
- for r_id in inline_shapes:
304
- text += self.image_tag.format(self.image_tag_index)
305
- self.image_tag_index += 1
306
- image = paragraph.part.related_parts[r_id].image
307
- paragraph_images.append(image)
308
- else:
309
- continue
310
-
311
- style = tuple([s if s is not None else d for s, d in zip(style, default_style)])
312
-
313
- # If the style changes for a non empty text, format the previous group and start a new one
314
- if (not self.is_text_empty(text)) and (previous_style is not None):
315
- if style != previous_style:
290
+ try:
291
+ if isinstance(c, Hyperlink):
292
+ text = f"[{c.text}]({c.address})"
293
+ style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
294
+ elif isinstance(c, Run):
295
+ text = c.text
296
+ style = (c.bold, c.italic, c.underline)
297
+
298
+ # 1. Locate the inline shape which is stored in the <w:drawing> element.
299
+ # 2. r:embed in <a.blip> has the relationship id for extracting the file where
300
+ # the image is stored as bytes.
301
+ # Reference:
302
+ # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
303
+ inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
304
+ for r_id in inline_shapes:
305
+ text += self.image_tag.format(self.image_tag_index)
306
+ self.image_tag_index += 1
307
+ try:
308
+ image = paragraph.part.related_parts[r_id].image
309
+ paragraph_images.append(image)
310
+ except Exception as img_e:
311
+ logger.warning(
312
+ "Failed to extract image with rId " "%s: %s -- object / file may be malformed",
313
+ r_id,
314
+ img_e,
315
+ )
316
+ else:
317
+ continue
318
+
319
+ style = tuple(s if s is not None else d for s, d in zip(style, default_style))
320
+
321
+ if not self.is_text_empty(text) and previous_style is not None and style != previous_style:
316
322
  paragraph_text += self.format_text(group_text, *previous_style)
317
323
  group_text = ""
318
324
 
319
- group_text += text
320
- if not self.is_text_empty(text):
321
- previous_style = style
325
+ group_text += text
326
+ if not self.is_text_empty(text):
327
+ previous_style = style
322
328
 
323
- # Format the last group
324
- if group_text:
325
- paragraph_text += self.format_text(group_text, *style)
329
+ except Exception as e:
330
+ logger.error("format_paragraph: failed to process run: %s", e)
331
+ continue
332
+
333
+ if group_text and previous_style:
334
+ paragraph_text += self.format_text(group_text, *previous_style)
335
+
336
+ return paragraph_text.strip(), paragraph_images
326
337
 
327
- # Remove trailing spaces
328
- paragraph_text = paragraph_text.strip()
329
- return paragraph_text, paragraph_images
338
+ except Exception as e:
339
+ logger.error("format_paragraph: failed for paragraph: %s", e)
340
+ return "", []
330
341
 
331
342
  def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
332
343
  """
@@ -344,12 +355,23 @@ class DocxReader:
344
355
  - A list of images extracted from the cell.
345
356
  """
346
357
 
347
- if self.paragraph_format == "markdown":
348
- newline = "<br>"
349
- else:
350
- newline = "\n"
351
- paragraph_texts, paragraph_images = zip(*[self.format_paragraph(p) for p in cell.paragraphs])
352
- return newline.join(paragraph_texts), paragraph_images
358
+ try:
359
+ newline = "<br>" if self.paragraph_format == "markdown" else "\n"
360
+ texts, images = [], []
361
+
362
+ for p in cell.paragraphs:
363
+ try:
364
+ t, imgs = self.format_paragraph(p)
365
+ texts.append(t)
366
+ images.extend(imgs)
367
+ except Exception as e:
368
+ logger.error("format_cell: failed to format paragraph in cell: %s", e)
369
+
370
+ return newline.join(texts), images
371
+
372
+ except Exception as e:
373
+ logger.error("format_cell: failed entirely: %s", e)
374
+ return "", []
353
375
 
354
376
  def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
355
377
  """
@@ -368,25 +390,50 @@ class DocxReader:
368
390
  - A DataFrame representation of the table's content.
369
391
  """
370
392
 
371
- rows = [[self.format_cell(cell) for cell in row.cells] for row in table.rows]
372
- texts = [[text for text, _ in row] for row in rows]
373
- table_images = [image for row in rows for _, images in row for image in images]
374
-
375
- table = pd.DataFrame(texts[1:], columns=texts[0])
376
- if "markdown" in self.table_format:
377
- table_text = table.to_markdown(index=False)
378
- if self.table_format == "markdown_light":
379
- table_text = re.sub(r"\s{2,}", " ", table_text)
380
- table_text = re.sub(r"-{2,}", "-", table_text)
381
- elif self.table_format == "csv":
382
- table_text = table.to_csv()
383
- elif self.table_format == "tag":
384
- table_text = self.table_tag.format(self.table_tag_index)
385
- self.table_tag_index += 1
386
- else:
387
- raise ValueError(f"Unknown table format {format}")
393
+ try:
394
+ rows_data = []
395
+ all_images = []
396
+
397
+ for row in table.rows:
398
+ row_texts = []
399
+ row_images = []
400
+ for cell in row.cells:
401
+ try:
402
+ cell_text, cell_imgs = self.format_cell(cell)
403
+ row_texts.append(cell_text)
404
+ row_images.extend(cell_imgs)
405
+ except Exception as e:
406
+ logger.error("format_table: failed to process cell: %s", e)
407
+ row_texts.append("") # pad for column alignment
408
+
409
+ rows_data.append(row_texts)
410
+ all_images.extend(row_images)
411
+
412
+ if not rows_data or not rows_data[0]:
413
+ return None, [], pd.DataFrame()
414
+
415
+ header = rows_data[0]
416
+ body = rows_data[1:]
417
+ df = pd.DataFrame(body, columns=header) if body else pd.DataFrame(columns=header)
418
+
419
+ if "markdown" in self.table_format:
420
+ table_text = df.to_markdown(index=False)
421
+ if self.table_format == "markdown_light":
422
+ table_text = re.sub(r"\s{2,}", " ", table_text)
423
+ table_text = re.sub(r"-{2,}", "-", table_text)
424
+ elif self.table_format == "csv":
425
+ table_text = df.to_csv(index=False)
426
+ elif self.table_format == "tag":
427
+ table_text = self.table_tag.format(self.table_tag_index)
428
+ self.table_tag_index += 1
429
+ else:
430
+ raise ValueError(f"Unknown table format {self.table_format}")
431
+
432
+ return table_text, all_images, df
388
433
 
389
- return table_text, table_images, table
434
+ except Exception as e:
435
+ logger.error("format_table: failed to format table: %s", e)
436
+ return None, [], pd.DataFrame()
390
437
 
391
438
  @staticmethod
392
439
  def apply_text_style(style: str, text: str, level: int = 0) -> str:
@@ -841,30 +888,39 @@ class DocxReader:
841
888
  self._prev_para_image_idx = 0
842
889
 
843
890
  para_idx = 0
844
-
845
891
  for child in self.document.element.body.iterchildren():
846
- if isinstance(child, CT_P):
847
- paragraph = Paragraph(child, self.document)
848
- paragraph_text, paragraph_images = self.format_paragraph(paragraph)
849
-
850
- if extract_text:
851
- self._extract_para_text(
852
- paragraph,
853
- paragraph_text,
854
- base_unified_metadata,
855
- text_depth,
856
- para_idx,
857
- )
858
-
859
- if (extract_charts or extract_images or extract_tables) and paragraph_images:
860
- self._prev_para_images = paragraph_images
861
- self._prev_para_image_idx = para_idx
862
- self._pending_images += [(image, para_idx, "", base_unified_metadata) for image in paragraph_images]
863
- self.images += paragraph_images
892
+ try:
893
+ if isinstance(child, CT_P):
894
+ paragraph = Paragraph(child, self.document)
895
+ paragraph_text, paragraph_images = self.format_paragraph(paragraph)
896
+
897
+ if extract_text:
898
+ try:
899
+ self._extract_para_text(
900
+ paragraph,
901
+ paragraph_text,
902
+ base_unified_metadata,
903
+ text_depth,
904
+ para_idx,
905
+ )
906
+ except Exception as e:
907
+ logger.error("extract_data: _extract_para_text failed: %s", e)
908
+
909
+ if (extract_images or extract_charts or extract_tables) and paragraph_images:
910
+ self._pending_images += [
911
+ (image, para_idx, "", base_unified_metadata) for image in paragraph_images
912
+ ]
913
+ self.images.extend(paragraph_images)
914
+
915
+ elif isinstance(child, CT_Tbl):
916
+ if extract_tables or extract_charts:
917
+ try:
918
+ self._extract_table_data(child, base_unified_metadata)
919
+ except Exception as e:
920
+ logger.error("extract_data: _extract_table_data failed: %s", e)
864
921
 
865
- elif isinstance(child, CT_Tbl):
866
- if extract_tables or extract_charts:
867
- self._extract_table_data(child, base_unified_metadata)
922
+ except Exception as e:
923
+ logger.error("extract_data: failed to process element at index %d: %s", para_idx, e)
868
924
 
869
925
  para_idx += 1
870
926