nv-ingest-api 2025.5.23.dev20250523__tar.gz → 2025.5.25.dev20250525__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (164) hide show
  1. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api.egg-info → nv_ingest_api-2025.5.25.dev20250525}/PKG-INFO +1 -1
  2. nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  3. nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  4. nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  5. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
  6. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api.egg-info/SOURCES.txt +3 -0
  7. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/LICENSE +0 -0
  8. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/MANIFEST.in +0 -0
  9. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/README.md +0 -0
  10. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/pyproject.toml +0 -0
  11. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/setup.cfg +0 -0
  12. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/__init__.py +0 -0
  13. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/interface/__init__.py +0 -0
  14. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/interface/extract.py +0 -0
  15. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/interface/mutate.py +0 -0
  16. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/interface/store.py +0 -0
  17. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/interface/transform.py +0 -0
  18. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/interface/utility.py +0 -0
  19. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/__init__.py +0 -0
  20. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
  21. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/enums/common.py +0 -0
  22. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
  23. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
  24. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
  25. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
  26. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
  27. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  28. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
  29. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
  30. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
  31. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/extract/image → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/extract/html}/__init__.py +0 -0
  32. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/extract/image/image_helpers → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/extract/image}/__init__.py +0 -0
  33. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
  34. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
  35. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/extract/pdf → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/extract/image/image_helpers}/__init__.py +0 -0
  36. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
  37. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
  38. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
  39. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/mutate → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/extract/pdf}/__init__.py +0 -0
  40. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
  41. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
  42. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
  43. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
  44. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
  45. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -0
  46. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
  47. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
  48. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
  49. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
  50. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  51. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
  52. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
  53. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/primitives/nim/model_interface → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/mutate}/__init__.py +0 -0
  54. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
  55. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
  56. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  57. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
  58. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
  59. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
  60. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
  61. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/schemas → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/primitives/nim/model_interface}/__init__.py +0 -0
  62. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
  63. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
  64. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
  65. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
  66. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
  67. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -0
  68. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
  69. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
  70. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
  71. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
  72. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -0
  73. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
  74. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  75. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
  76. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
  77. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
  78. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/schemas/extract → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/schemas}/__init__.py +0 -0
  79. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/schemas/meta → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/schemas/extract}/__init__.py +0 -0
  80. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
  81. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
  82. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
  83. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
  84. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
  85. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
  86. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
  87. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
  88. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
  89. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
  90. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
  91. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
  92. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/schemas/mutate → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/schemas/meta}/__init__.py +0 -0
  93. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
  94. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -0
  95. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
  96. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/schemas/store → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/schemas/mutate}/__init__.py +0 -0
  97. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
  98. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/schemas/transform → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/schemas/store}/__init__.py +0 -0
  99. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
  100. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
  101. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/store → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/schemas/transform}/__init__.py +0 -0
  102. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
  103. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
  104. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
  105. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
  106. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/internal/transform → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/store}/__init__.py +0 -0
  107. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
  108. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
  109. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/util → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/internal/transform}/__init__.py +0 -0
  110. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
  111. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
  112. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
  113. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/util/message_brokers → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/util}/__init__.py +0 -0
  114. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  115. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/control_message/validators.py +0 -0
  116. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/converters/__init__.py +0 -0
  117. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
  118. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/converters/containers.py +0 -0
  119. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/converters/datetools.py +0 -0
  120. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/converters/dftools.py +0 -0
  121. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/converters/formats.py +0 -0
  122. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
  123. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
  124. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/detectors/language.py +0 -0
  125. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  126. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
  127. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
  128. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
  129. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
  130. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
  131. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
  132. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
  133. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
  134. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
  135. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
  136. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/logging/__init__.py +0 -0
  137. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/logging/configuration.py +0 -0
  138. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/util/schema → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/util/message_brokers}/__init__.py +0 -0
  139. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
  140. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
  141. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
  142. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
  143. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
  144. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
  145. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
  146. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
  147. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/nim/__init__.py +0 -0
  148. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
  149. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
  150. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/util/service_clients → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/util/schema}/__init__.py +0 -0
  151. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
  152. {nv_ingest_api-2025.5.23.dev20250523/src/nv_ingest_api/util/service_clients/redis → nv_ingest_api-2025.5.25.dev20250525/src/nv_ingest_api/util/service_clients}/__init__.py +0 -0
  153. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
  154. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
  155. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
  156. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  157. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
  158. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
  159. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/system/__init__.py +0 -0
  160. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
  161. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  162. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api.egg-info/requires.txt +0 -0
  163. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
  164. {nv_ingest_api-2025.5.23.dev20250523 → nv_ingest_api-2025.5.25.dev20250525}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.5.23.dev20250523
3
+ Version: 2025.5.25.dev20250525
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -0,0 +1,84 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ import uuid
8
+ from typing import Optional, Dict, Any, Union, Tuple, List
9
+
10
+ import pandas as pd
11
+
12
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
13
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
14
+ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
15
+ from nv_ingest_api.util.schema.schema_validator import validate_schema
16
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
17
+
18
+ from markitdown.converters import HtmlConverter
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ @unified_exception_handler
24
+ def _convert_html(row: pd.Series, execution_trace_log: Optional[List[Any]] = None):
25
+ metadata = row.get("metadata")
26
+ html_content = row.get("content")
27
+
28
+ if html_content:
29
+ html_converter = HtmlConverter()
30
+ md_content = html_converter.convert_string(html_content=html_content).text_content
31
+ metadata["content"] = md_content
32
+
33
+ return [[ContentTypeEnum.TEXT, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]]
34
+
35
+
36
+ def extract_markdown_from_html_internal(
37
+ df_extraction_ledger: pd.DataFrame,
38
+ task_config: Dict[str, Any],
39
+ extraction_config: HtmlExtractorSchema,
40
+ execution_trace_log: Optional[Dict[str, Any]] = None,
41
+ ) -> Tuple[pd.DataFrame, Union[Dict, None]]:
42
+ """
43
+ Processes a pandas DataFrame containing HTML file content, extracting html as text from
44
+ each document and converting it to markdown.
45
+
46
+ Parameters
47
+ ----------
48
+ df_extraction_ledger : pd.DataFrame
49
+ The input DataFrame containing html files as raw text. Expected columns include
50
+ 'source_id' and 'content'.
51
+ task_config : Union[Dict[str, Any], BaseModel]
52
+ Configuration instructions for the document processing task. This can be provided as a
53
+ dictionary or a Pydantic model.
54
+ extraction_config : HtmlExtractorSchema
55
+ A configuration object for document extraction that guides the extraction process.
56
+ execution_trace_log : Optional[Dict[str, Any]], default=None
57
+ An optional dictionary containing trace information for debugging or logging.
58
+
59
+ Returns
60
+ -------
61
+ Tuple[pd.DataFrame, Union[Dict, None]]
62
+ A DataFrame with the original html content converted to markdown (columns "document_type",
63
+ "metadata", and "uuid"), followed by an empty dictionary.
64
+
65
+ Raises
66
+ ------
67
+ Exception
68
+ If an error occurs during the document extraction process, the exception is logged and
69
+ re-raised.
70
+ """
71
+
72
+ # Apply the _convert_html function to each row in the DataFrame.
73
+ sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)
74
+
75
+ # Explode any list results and drop missing values.
76
+ sr_extraction = sr_extraction.explode().dropna()
77
+
78
+ # Convert the extraction results to a DataFrame if available.
79
+ if not sr_extraction.empty:
80
+ extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
81
+ else:
82
+ extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
83
+
84
+ return extracted_df, {}
@@ -0,0 +1,34 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ from pydantic import ConfigDict, BaseModel
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class HtmlExtractorSchema(BaseModel):
14
+ """
15
+ Configuration schema for the Html extractor settings.
16
+
17
+ Parameters
18
+ ----------
19
+ max_queue_size : int, default=1
20
+ The maximum number of items allowed in the processing queue.
21
+
22
+ n_workers : int, default=16
23
+ The number of worker threads to use for processing.
24
+
25
+ raise_on_failure : bool, default=False
26
+ A flag indicating whether to raise an exception on processing failure.
27
+
28
+ """
29
+
30
+ max_queue_size: int = 1
31
+ n_workers: int = 16
32
+ raise_on_failure: bool = False
33
+
34
+ model_config = ConfigDict(extra="forbid")
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.5.23.dev20250523
3
+ Version: 2025.5.25.dev20250525
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -27,6 +27,8 @@ src/nv_ingest_api/internal/extract/docx/engines/__init__.py
27
27
  src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py
28
28
  src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py
29
29
  src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py
30
+ src/nv_ingest_api/internal/extract/html/__init__.py
31
+ src/nv_ingest_api/internal/extract/html/html_extractor.py
30
32
  src/nv_ingest_api/internal/extract/image/__init__.py
31
33
  src/nv_ingest_api/internal/extract/image/chart_extractor.py
32
34
  src/nv_ingest_api/internal/extract/image/image_extractor.py
@@ -78,6 +80,7 @@ src/nv_ingest_api/internal/schemas/extract/__init__.py
78
80
  src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py
79
81
  src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py
80
82
  src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py
83
+ src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py
81
84
  src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py
82
85
  src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py
83
86
  src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py