nv-ingest-api 2025.4.17.dev20250417__tar.gz → 2025.4.19.dev20250419__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (160)
  1. {nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api.egg-info → nv_ingest_api-2025.4.19.dev20250419}/PKG-INFO +1 -1
  2. {nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api}/primitives/control_message_task.py +0 -4
  3. {nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api}/primitives/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
  5. nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api.egg-info/SOURCES.txt +14 -0
  6. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/__init__.py +0 -3
  7. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  21. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  22. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  23. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/image/__init__.py +0 -3
  24. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  25. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  26. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  27. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  28. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  29. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  30. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  31. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  32. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  33. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  34. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  35. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  36. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  37. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  38. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  39. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  40. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  41. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  43. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  44. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/mutate/__init__.py +0 -3
  45. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  46. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/mutate/filter.py +0 -133
  47. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  48. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  49. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  50. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  51. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  52. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  53. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  54. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  55. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  56. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  57. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  58. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  59. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  60. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  61. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  62. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  63. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  64. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  65. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  66. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  67. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/__init__.py +0 -3
  68. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  69. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  70. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  71. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  72. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  73. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  74. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  75. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  76. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  77. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  78. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  79. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  80. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  81. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  82. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  83. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  84. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  85. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  86. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  87. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  88. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  89. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  90. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  91. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  92. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  93. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  94. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  95. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/store/__init__.py +0 -3
  96. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  97. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/store/image_upload.py +0 -232
  98. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/transform/__init__.py +0 -3
  99. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/transform/caption_image.py +0 -205
  100. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/transform/embed_text.py +0 -496
  101. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/transform/split_text.py +0 -157
  102. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/__init__.py +0 -0
  103. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  104. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/control_message/validators.py +0 -47
  105. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/converters/__init__.py +0 -0
  106. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/converters/bytetools.py +0 -78
  107. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/converters/containers.py +0 -65
  108. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/converters/datetools.py +0 -90
  109. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/converters/dftools.py +0 -127
  110. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/converters/formats.py +0 -64
  111. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/converters/type_mappings.py +0 -27
  112. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/detectors/__init__.py +0 -5
  113. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/detectors/language.py +0 -38
  114. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  115. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/exception_handlers/converters.py +0 -72
  116. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  117. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  118. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  119. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  120. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/image_processing/__init__.py +0 -5
  121. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/image_processing/clustering.py +0 -260
  122. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/image_processing/processing.py +0 -179
  123. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  124. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/image_processing/transforms.py +0 -407
  125. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/logging/__init__.py +0 -0
  126. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/logging/configuration.py +0 -31
  127. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/message_brokers/__init__.py +0 -3
  128. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  129. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  130. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  131. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
  132. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/metadata/__init__.py +0 -5
  133. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/metadata/aggregators.py +0 -469
  134. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/multi_processing/__init__.py +0 -8
  135. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  136. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/nim/__init__.py +0 -56
  137. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/pdf/__init__.py +0 -3
  138. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/pdf/pdfium.py +0 -427
  139. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/schema/__init__.py +0 -0
  140. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/schema/schema_validator.py +0 -10
  141. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/service_clients/__init__.py +0 -3
  142. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/service_clients/client_base.py +0 -72
  143. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  144. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  145. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
  146. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  147. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -398
  148. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/util/string_processing/__init__.py +0 -51
  149. nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api.egg-info/SOURCES.txt +0 -157
  150. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419}/LICENSE +0 -0
  151. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419}/MANIFEST.in +0 -0
  152. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419}/README.md +0 -0
  153. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419}/pyproject.toml +0 -0
  154. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419}/setup.cfg +0 -0
  155. {nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api}/__init__.py +0 -0
  156. {nv_ingest_api-2025.4.17.dev20250417/src/nv_ingest_api/internal/extract/docx/engines → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api/primitives}/__init__.py +0 -0
  157. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  158. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419}/src/nv_ingest_api.egg-info/requires.txt +0 -0
  159. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
  160. {nv_ingest_api-2025.4.17.dev20250417 → nv_ingest_api-2025.4.19.dev20250419}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.4.17.dev20250417
3
+ Version: 2025.4.19.dev20250419
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -1,7 +1,3 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
1
  from uuid import UUID
6
2
 
7
3
  from pydantic import BaseModel, Field, ConfigDict
@@ -1,7 +1,3 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
1
  import copy
6
2
  import re
7
3
  from datetime import datetime
@@ -10,7 +6,8 @@ import logging
10
6
  import pandas as pd
11
7
  from typing import Any, Dict, Generator, Union
12
8
 
13
- from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
9
+ from nv_ingest_api.primitives.control_message_task import ControlMessageTask
10
+
14
11
 
15
12
  logger = logging.getLogger(__name__)
16
13
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.4.17.dev20250417
3
+ Version: 2025.4.19.dev20250419
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -0,0 +1,14 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ src/version.py
6
+ src/nv_ingest_api/__init__.py
7
+ src/nv_ingest_api.egg-info/PKG-INFO
8
+ src/nv_ingest_api.egg-info/SOURCES.txt
9
+ src/nv_ingest_api.egg-info/dependency_links.txt
10
+ src/nv_ingest_api.egg-info/requires.txt
11
+ src/nv_ingest_api.egg-info/top_level.txt
12
+ src/nv_ingest_api/primitives/__init__.py
13
+ src/nv_ingest_api/primitives/control_message_task.py
14
+ src/nv_ingest_api/primitives/ingest_control_message.py
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
@@ -1,215 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import logging
6
- import functools
7
- import inspect
8
- import pprint
9
- from typing import Dict, Any, Optional, List
10
-
11
- from pydantic import BaseModel
12
-
13
- from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema, NemoRetrieverParseConfigSchema
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
- ## CONFIG_SCHEMAS is a global dictionary that maps extraction methods to Pydantic schemas.
18
- CONFIG_SCHEMAS: Dict[str, Any] = {
19
- "adobe": PDFiumConfigSchema,
20
- "llama": PDFiumConfigSchema,
21
- "nemoretriever_parse": NemoRetrieverParseConfigSchema,
22
- "pdfium": PDFiumConfigSchema,
23
- "tika": PDFiumConfigSchema,
24
- "unstructured_io": PDFiumConfigSchema,
25
- }
26
-
27
-
28
- def _build_config_from_schema(schema_class: type[BaseModel], args: Dict[str, Any]) -> Dict[str, Any]:
29
- """
30
- Build and validate a configuration dictionary from the provided arguments using a Pydantic schema.
31
-
32
- This function filters the supplied arguments to include only those keys defined in the given
33
- Pydantic schema (using Pydantic v2's `model_fields`), instantiates the schema for validation,
34
- and returns the validated configuration as a dictionary.
35
-
36
- Parameters
37
- ----------
38
- schema_class : type[BaseModel]
39
- The Pydantic BaseModel subclass used for validating the configuration.
40
- args : dict
41
- A dictionary of arguments from which to extract and validate configuration data.
42
-
43
- Returns
44
- -------
45
- dict
46
- A dictionary containing the validated configuration data as defined by the schema.
47
-
48
- Raises
49
- ------
50
- pydantic.ValidationError
51
- If the provided arguments do not conform to the schema.
52
- """
53
- field_names = schema_class.model_fields.keys()
54
- config_data = {k: v for k, v in args.items() if k in field_names}
55
- # Instantiate the schema to perform validation, then return the model's dictionary representation.
56
-
57
- return schema_class(**config_data).dict()
58
-
59
-
60
- def extraction_interface_relay_constructor(api_fn, task_keys: Optional[List[str]] = None):
61
- """
62
- Decorator for constructing and validating configuration using Pydantic schemas.
63
-
64
- This decorator wraps a user-facing interface function. It extracts common task parameters
65
- (using the provided task_keys, or defaults if not specified) and method-specific configuration
66
- parameters based on a required 'extract_method' keyword argument. It then uses the corresponding
67
- Pydantic schema (from the global CONFIG_SCHEMAS registry) to validate and build a method-specific
68
- configuration. The resulting composite configuration, along with the extraction ledger and
69
- execution trace log, is passed to the backend API function.
70
-
71
- Parameters
72
- ----------
73
- api_fn : callable
74
- The backend API function that will be called with the extraction ledger, the task configuration
75
- dictionary, the extractor configuration, and the execution trace log. This function must conform
76
- to the signature:
77
-
78
- extract_primitives_from_pdf_internal(df_extraction_ledger: pd.DataFrame,
79
- task_config: Dict[str, Any],
80
- extractor_config: Any,
81
- execution_trace_log: Optional[List[Any]] = None)
82
- task_keys : list of str, optional
83
- A list of keyword names that should be extracted from the user function as common task parameters.
84
- If not provided, defaults to ["extract_text", "extract_images", "extract_tables", "extract_charts"].
85
-
86
- Returns
87
- -------
88
- callable
89
- A wrapped function that builds and validates the configuration before invoking the backend API function.
90
-
91
- Raises
92
- ------
93
- ValueError
94
- If the extraction method specified is not supported (i.e., no corresponding Pydantic schema exists
95
- in CONFIG_SCHEMAS), if api_fn does not conform to the expected signature, or if the required
96
- 'extract_method' parameter is not provided.
97
- """
98
- # Verify that api_fn conforms to the expected signature.
99
- try:
100
- # Try binding four arguments: ledger, task_config, extractor_config, and execution_trace_log.
101
- inspect.signature(api_fn).bind("dummy_ledger", {"dummy": True}, {"dummy": True}, {})
102
- except TypeError as e:
103
- raise ValueError(
104
- "api_fn must conform to the signature: "
105
- "extract_primitives_from_pdf(df_extraction_ledger, task_config, extractor_config, execution_trace_log)"
106
- ) from e
107
-
108
- if task_keys is None:
109
- task_keys = []
110
-
111
- def decorator(user_fn):
112
- @functools.wraps(user_fn)
113
- def wrapper(*args, **kwargs):
114
- # Use bind_partial so that missing required arguments can be handled gracefully.
115
- sig = inspect.signature(user_fn)
116
- bound = sig.bind_partial(*args, **kwargs)
117
- bound.apply_defaults()
118
-
119
- # The first parameter is assumed to be the extraction ledger.
120
- param_names = list(sig.parameters.keys())
121
- if param_names[0] not in bound.arguments:
122
- raise ValueError("Missing required ledger argument.")
123
- ledger = bound.arguments[param_names[0]]
124
-
125
- # Process reserved 'execution_trace_log'.
126
- execution_trace_log = bound.arguments.get("execution_trace_log", None)
127
- if execution_trace_log is None:
128
- execution_trace_log = {} # Replace None with an empty dict.
129
- if "execution_trace_log" in bound.arguments:
130
- del bound.arguments["execution_trace_log"]
131
-
132
- # Ensure that 'extract_method' is provided.
133
- if "extract_method" not in bound.arguments or bound.arguments["extract_method"] is None:
134
- raise ValueError("The 'extract_method' parameter is required.")
135
- extract_method = bound.arguments["extract_method"]
136
- del bound.arguments["extract_method"]
137
-
138
- # Extract common task parameters using the specified task_keys.
139
- task_params = {key: bound.arguments[key] for key in task_keys if key in bound.arguments}
140
- task_params["extract_method"] = extract_method
141
- task_config = {"params": task_params}
142
-
143
- # Look up the appropriate Pydantic schema.
144
- schema_class = CONFIG_SCHEMAS.get(extract_method)
145
- if schema_class is None:
146
- raise ValueError(f"Unsupported extraction method: {extract_method}")
147
-
148
- # Build the method-specific configuration using the schema class.
149
- extraction_config_dict = _build_config_from_schema(schema_class, bound.arguments)
150
-
151
- # Create a Pydantic object instead of a dictionary for the specific extractor config
152
- extractor_schema = None
153
- try:
154
- # Find the appropriate extractor schema class based on the extraction method
155
- extractor_schema_name = f"{extract_method.capitalize()}ExtractorSchema"
156
- extractor_schema_class = globals().get(extractor_schema_name)
157
-
158
- if extractor_schema_class is None:
159
- # Try another common naming pattern
160
- extractor_schema_name = f"{extract_method.upper()}ExtractorSchema"
161
- extractor_schema_class = globals().get(extractor_schema_name)
162
-
163
- if extractor_schema_class is None:
164
- # Final fallback attempt with camelCase
165
- extractor_schema_name = f"{extract_method[0].upper() + extract_method[1:]}ExtractorSchema"
166
- extractor_schema_class = globals().get(extractor_schema_name)
167
-
168
- if extractor_schema_class is not None:
169
- # Create the extractor schema with the method-specific config
170
- config_key = f"{extract_method}_config"
171
- extractor_schema = extractor_schema_class(**{config_key: extraction_config_dict})
172
- else:
173
- logger.warning(f"Could not find extractor schema class for method: {extract_method}")
174
- except Exception as e:
175
- logger.warning(f"Error creating extractor schema: {str(e)}")
176
- # Fall back to dictionary approach if schema creation fails
177
- extractor_schema = {f"{extract_method}_config": extraction_config_dict}
178
-
179
- # If schema creation failed, fall back to dictionary
180
- if extractor_schema is None:
181
- extractor_schema = {f"{extract_method}_config": extraction_config_dict}
182
-
183
- # Log the task and extractor configurations for debugging
184
- logger.debug("\n" + "=" * 80)
185
- logger.debug(f"DEBUG - API Function: {api_fn.__name__}")
186
- logger.debug(f"DEBUG - Extract Method: {extract_method}")
187
- logger.debug("-" * 80)
188
-
189
- # Format the task config as a string and log it
190
- task_config_str = pprint.pformat(task_config, width=100, sort_dicts=False)
191
- logger.debug(f"DEBUG - Task Config:\n{task_config_str}")
192
- logger.debug("-" * 80)
193
-
194
- # Format the extractor config as a string and log it
195
- if hasattr(extractor_schema, "model_dump"):
196
- extractor_config_str = pprint.pformat(extractor_schema.model_dump(), width=100, sort_dicts=False)
197
- else:
198
- extractor_config_str = pprint.pformat(extractor_schema, width=100, sort_dicts=False)
199
- logger.debug(f"DEBUG - Extractor Config Type: {type(extractor_schema)}")
200
- logger.debug(f"DEBUG - Extractor Config:\n{extractor_config_str}")
201
- logger.debug("=" * 80 + "\n")
202
-
203
- # Call the backend API function.
204
- pprint.pprint(task_config)
205
- pprint.pprint(extractor_schema)
206
- result = api_fn(ledger, task_config, extractor_schema, execution_trace_log)
207
-
208
- # If the result is a tuple, return only the first element
209
- if isinstance(result, tuple):
210
- return result[0]
211
- return result
212
-
213
- return wrapper
214
-
215
- return decorator