nv-ingest-api 2025.8.13.dev20250813__tar.gz → 2025.8.15.dev20250815__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. (The original page links to more details here.)

Files changed (177)
  1. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api.egg-info → nv_ingest_api-2025.8.15.dev20250815}/PKG-INFO +1 -1
  2. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/enums/common.py +37 -0
  3. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/image_extractor.py +5 -1
  4. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/meta/udf.py +232 -0
  5. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +63 -22
  6. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +102 -15
  7. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +40 -4
  8. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  9. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/transform/embed_text.py +5 -0
  10. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/decorators.py +104 -156
  11. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/imports/callable_signatures.py +108 -0
  12. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/imports/dynamic_resolvers.py +53 -5
  13. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/introspection/class_inspect.py +145 -0
  14. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/introspection/function_inspect.py +65 -0
  15. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/logging/configuration.py +102 -0
  16. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/service_clients/__init__.py +3 -0
  17. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  18. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/string_processing/configuration.py +682 -0
  19. nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/string_processing/yaml.py +45 -0
  20. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/system/hardware_info.py +178 -13
  21. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
  22. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api.egg-info/SOURCES.txt +8 -0
  23. nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/imports/callable_signatures.py +0 -50
  24. nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/logging/configuration.py +0 -38
  25. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/LICENSE +0 -0
  26. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/MANIFEST.in +0 -0
  27. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/README.md +0 -0
  28. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/pyproject.toml +0 -0
  29. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/setup.cfg +0 -0
  30. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/__init__.py +0 -0
  31. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/__init__.py +0 -0
  32. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/extract.py +0 -0
  33. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/mutate.py +0 -0
  34. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/store.py +0 -0
  35. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/transform.py +0 -0
  36. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/utility.py +0 -0
  37. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/__init__.py +0 -0
  38. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
  39. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
  40. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
  41. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
  42. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
  43. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
  44. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  45. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
  46. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
  47. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
  48. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
  49. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
  50. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
  51. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
  52. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
  53. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
  54. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
  55. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
  56. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
  57. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
  58. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
  59. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
  60. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
  61. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
  62. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -0
  63. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
  64. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
  65. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
  66. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
  67. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  68. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
  69. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
  70. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/mutate → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/meta}/__init__.py +0 -0
  71. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/primitives/nim/model_interface → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/mutate}/__init__.py +0 -0
  72. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
  73. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
  74. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  75. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
  76. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
  77. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
  78. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/primitives/nim/model_interface}/__init__.py +0 -0
  79. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
  80. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
  81. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
  82. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
  83. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
  84. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +0 -0
  85. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
  86. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
  87. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
  88. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
  89. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -0
  90. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
  91. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  92. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
  93. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
  94. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/extract → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas}/__init__.py +0 -0
  95. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/meta → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/extract}/__init__.py +0 -0
  96. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
  97. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
  98. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
  99. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
  100. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
  101. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
  102. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
  103. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
  104. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
  105. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
  106. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
  107. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
  108. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
  109. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/mutate → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/meta}/__init__.py +0 -0
  110. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
  111. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
  112. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/store → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/mutate}/__init__.py +0 -0
  113. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
  114. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/transform → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/store}/__init__.py +0 -0
  115. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
  116. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
  117. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/store → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/transform}/__init__.py +0 -0
  118. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
  119. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
  120. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
  121. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
  122. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/transform → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/store}/__init__.py +0 -0
  123. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
  124. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
  125. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/transform}/__init__.py +0 -0
  126. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
  127. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
  128. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/imports → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util}/__init__.py +0 -0
  129. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  130. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/control_message/validators.py +0 -0
  131. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/__init__.py +0 -0
  132. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
  133. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/containers.py +0 -0
  134. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/datetools.py +0 -0
  135. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/dftools.py +0 -0
  136. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/formats.py +0 -0
  137. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
  138. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
  139. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/detectors/language.py +0 -0
  140. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  141. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
  142. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
  143. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
  144. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
  145. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
  146. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
  147. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
  148. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
  149. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
  150. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/message_brokers → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/imports}/__init__.py +0 -0
  151. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/schema → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/introspection}/__init__.py +0 -0
  152. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/logging/__init__.py +0 -0
  153. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/service_clients → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/message_brokers}/__init__.py +0 -0
  154. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
  155. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
  156. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
  157. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
  158. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
  159. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
  160. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
  161. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
  162. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/nim/__init__.py +0 -0
  163. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
  164. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
  165. {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/service_clients/redis → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/schema}/__init__.py +0 -0
  166. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
  167. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
  168. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
  169. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
  170. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  171. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
  172. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
  173. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/system/__init__.py +0 -0
  174. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  175. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api.egg-info/requires.txt +0 -0
  176. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
  177. {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.8.13.dev20250813
3
+ Version: 2025.8.15.dev20250815
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -386,6 +386,40 @@ class StatusEnum(str, Enum):
386
386
  SUCCESS: str = "success"
387
387
 
388
388
 
389
+ class PipelinePhase(int, Enum):
390
+ """
391
+ The logical phase of a pipeline stage.
392
+
393
+ Attributes
394
+ ----------
395
+ PRE_PROCESSING : int
396
+ Pre-processing phase.
397
+ EXTRACTION : int
398
+ Extraction phase.
399
+ POST_PROCESSING : int
400
+ Post-processing phase.
401
+ MUTATION : int
402
+ Mutation phase.
403
+ TRANSFORM : int
404
+ Transform phase.
405
+ RESPONSE : int
406
+ Response phase.
407
+ TELEMETRY : int
408
+ Telemetry phase.
409
+ DRAIN : int
410
+ Drain phase.
411
+ """
412
+
413
+ PRE_PROCESSING = 0
414
+ EXTRACTION = 1
415
+ POST_PROCESSING = 2
416
+ MUTATION = 3
417
+ TRANSFORM = 4
418
+ RESPONSE = 5
419
+ TELEMETRY = 6
420
+ DRAIN = 7
421
+
422
+
389
423
  class TableFormatEnum(str, Enum):
390
424
  """
391
425
  Enum for representing table formats.
@@ -446,6 +480,8 @@ class TaskTypeEnum(str, Enum):
446
480
  Represents a task for extracting chart data.
447
481
  INFOGRAPHIC_DATA_EXTRACT : str
448
482
  Represents a task for extracting infographic data.
483
+ UDF : str
484
+ Represents a user-defined function task.
449
485
  """
450
486
 
451
487
  AUDIO_DATA_EXTRACT: str = "audio_data_extract"
@@ -460,6 +496,7 @@ class TaskTypeEnum(str, Enum):
460
496
  STORE_EMBEDDING: str = "store_embedding"
461
497
  STORE: str = "store"
462
498
  TABLE_DATA_EXTRACT: str = "table_data_extract"
499
+ UDF: str = "udf"
463
500
  VDB_UPLOAD: str = "vdb_upload"
464
501
 
465
502
 
@@ -108,8 +108,12 @@ def _decode_and_extract_from_image(
108
108
  f"decode_and_extract: Extracting image content using image_extraction_config: "
109
109
  f"{validated_extraction_config}"
110
110
  )
111
+ # Ensure we pass the correct nested config type (ImageConfigSchema) to helpers.
112
+ # Some callers provide the full ImageExtractorSchema; extract its inner image_extraction_config.
111
113
  if validated_extraction_config is not None:
112
- extract_params["image_extraction_config"] = validated_extraction_config
114
+ inner_cfg = getattr(validated_extraction_config, "image_extraction_config", validated_extraction_config)
115
+ if inner_cfg is not None:
116
+ extract_params["image_extraction_config"] = inner_cfg
113
117
 
114
118
  if execution_trace_log is not None:
115
119
  extract_params["trace_info"] = execution_trace_log
@@ -0,0 +1,232 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import hashlib
6
+ import inspect
7
+ import logging
8
+ import time
9
+ from typing import Any, Dict, List, Optional
10
+ from dataclasses import dataclass
11
+
12
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_all_tasks_by_type
13
+ from nv_ingest_api.internal.schemas.meta.udf import UDFStageSchema
14
+ from nv_ingest_api.util.imports.callable_signatures import ingest_callable_signature
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ @dataclass
20
+ class CachedUDF:
21
+ """Cached UDF function with metadata"""
22
+
23
+ function: callable
24
+ function_name: str
25
+ signature_validated: bool
26
+ created_at: float
27
+ last_used: float
28
+ use_count: int
29
+
30
+
31
+ class UDFCache:
32
+ """LRU cache for compiled and validated UDF functions"""
33
+
34
+ def __init__(self, max_size: int = 128, ttl_seconds: Optional[int] = 3600):
35
+ self.max_size = max_size
36
+ self.ttl_seconds = ttl_seconds
37
+ self.cache: Dict[str, CachedUDF] = {}
38
+ self.access_order: List[str] = [] # For LRU tracking
39
+
40
+ def _generate_cache_key(self, udf_function_str: str, udf_function_name: str) -> str:
41
+ """Generate cache key from UDF string and function name"""
42
+ content = f"{udf_function_str.strip()}:{udf_function_name}"
43
+ return hashlib.sha256(content.encode()).hexdigest()
44
+
45
+ def _evict_lru(self):
46
+ """Remove least recently used item"""
47
+ if self.access_order:
48
+ lru_key = self.access_order.pop(0)
49
+ self.cache.pop(lru_key, None)
50
+
51
+ def _cleanup_expired(self):
52
+ """Remove expired entries if TTL is configured"""
53
+ if not self.ttl_seconds:
54
+ return
55
+
56
+ current_time = time.time()
57
+ expired_keys = [
58
+ key for key, cached_udf in self.cache.items() if current_time - cached_udf.created_at > self.ttl_seconds
59
+ ]
60
+
61
+ for key in expired_keys:
62
+ self.cache.pop(key, None)
63
+ if key in self.access_order:
64
+ self.access_order.remove(key)
65
+
66
+ def get(self, udf_function_str: str, udf_function_name: str) -> Optional[CachedUDF]:
67
+ """Get cached UDF function if available"""
68
+ self._cleanup_expired()
69
+
70
+ cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
71
+
72
+ if cache_key in self.cache:
73
+ # Update access tracking
74
+ if cache_key in self.access_order:
75
+ self.access_order.remove(cache_key)
76
+ self.access_order.append(cache_key)
77
+
78
+ # Update usage stats
79
+ cached_udf = self.cache[cache_key]
80
+ cached_udf.last_used = time.time()
81
+ cached_udf.use_count += 1
82
+
83
+ return cached_udf
84
+
85
+ return None
86
+
87
def put(
    self, udf_function_str: str, udf_function_name: str, function: callable, signature_validated: bool = True
) -> str:
    """
    Cache a compiled and validated UDF function.

    A new key makes room first by evicting least-recently-used entries while the
    cache is at capacity. A key that is already cached is replaced in place and
    does not trigger an eviction. In both cases the key ends up exactly once, at
    the most-recently-used end of ``self.access_order``.

    Parameters
    ----------
    udf_function_str : str
        The UDF source string; part of the cache key.
    udf_function_name : str
        The name of the UDF function; part of the cache key.
    function : callable
        The compiled function object to store.
    signature_validated : bool, optional
        Stored on the entry as ``signature_validated``. Defaults to True.

    Returns
    -------
    str
        The cache key the entry was stored under.
    """
    cache_key = self._generate_cache_key(udf_function_str, udf_function_name)

    # Forget any earlier position of this key so it is never listed twice in the
    # access order; a duplicate would later cause the wrong entry to be evicted.
    if cache_key in self.access_order:
        self.access_order.remove(cache_key)

    # Only a brand-new key grows the cache, so only then make room. Stop once there is
    # nothing left to evict, otherwise a non-positive max_size would loop forever.
    if cache_key not in self.cache:
        while self.access_order and len(self.cache) >= self.max_size:
            self._evict_lru()

    current_time = time.time()
    cached_udf = CachedUDF(
        function=function,
        function_name=udf_function_name,
        signature_validated=signature_validated,
        created_at=current_time,
        last_used=current_time,
        use_count=1,
    )

    self.cache[cache_key] = cached_udf
    self.access_order.append(cache_key)

    return cache_key
111
+
112
def get_stats(self) -> Dict[str, Any]:
    """
    Summarize the current cache contents.

    Returns
    -------
    Dict[str, Any]
        A dictionary with the keys ``size`` (number of cached entries), ``max_size``,
        ``total_uses`` (sum of every entry's ``use_count``), ``most_used_function``
        (``function_name`` of the entry with the highest ``use_count``, or None when
        the cache is empty) and ``most_used_count`` (that entry's ``use_count``, or 0).
    """
    entries = list(self.cache.values())
    top_entry = max(entries, key=lambda entry: entry.use_count, default=None)

    if top_entry:
        top_name, top_count = top_entry.function_name, top_entry.use_count
    else:
        top_name, top_count = None, 0

    return {
        "size": len(self.cache),
        "max_size": self.max_size,
        "total_uses": sum(entry.use_count for entry in entries),
        "most_used_function": top_name,
        "most_used_count": top_count,
    }
123
+
124
+
125
# Module-level UDF cache instance, configured with max_size=128 and ttl_seconds=3600.
# udf_stage_callable_fn reads and fills it; get_udf_cache_stats reports on it.
_udf_cache = UDFCache(max_size=128, ttl_seconds=3600)
127
+
128
+
129
def compile_and_validate_udf(udf_function_str: str, udf_function_name: str, task_num: int) -> callable:
    """
    Execute a UDF source string and return the validated function it defines.

    Parameters
    ----------
    udf_function_str : str
        Python source code that defines the UDF.
    udf_function_name : str
        Name of the function to pick out of the executed source.
    task_num : int
        Number of the UDF task; only used to label error messages.

    Returns
    -------
    callable
        The function object named ``udf_function_name``.

    Raises
    ------
    ValueError
        If executing the source raises, if ``udf_function_name`` is not defined by the
        source or is not callable, or if signature validation fails. When another
        exception triggered the failure it is attached as ``__cause__``.
    """
    # SECURITY: exec() runs arbitrary Python. The fresh dict only keeps the names the UDF
    # defines out of this module's globals; it is NOT a sandbox (builtins stay available),
    # so the source string must come from a trusted submitter.
    namespace: Dict[str, Any] = {}
    try:
        exec(udf_function_str, namespace)
    except Exception as e:
        raise ValueError(f"UDF task {task_num} failed to execute: {str(e)}") from e

    # Extract the specified function from the namespace
    udf_function = namespace.get(udf_function_name)
    if not callable(udf_function):
        raise ValueError(f"UDF task {task_num}: Specified UDF function '{udf_function_name}' not found or not callable")

    # Validate the UDF function signature
    try:
        ingest_callable_signature(inspect.signature(udf_function))
    except Exception as e:
        raise ValueError(f"UDF task {task_num} has invalid function signature: {str(e)}") from e

    return udf_function
151
+
152
+
153
def get_udf_cache_stats() -> Dict[str, Any]:
    """
    Report statistics for the module-level UDF cache.

    Returns
    -------
    Dict[str, Any]
        The dictionary produced by ``_udf_cache.get_stats()``.
    """
    stats = _udf_cache.get_stats()
    return stats
156
+
157
+
158
def udf_stage_callable_fn(control_message: IngestControlMessage, stage_config: UDFStageSchema) -> IngestControlMessage:
    """
    UDF stage callable function that processes UDF tasks in a control message.

    All tasks of type "udf" are removed from the control message and executed one after
    another. Each UDF receives the control message returned by the previous one. Compiled
    UDFs are looked up in, and stored into, the module-level UDF cache.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message containing UDF tasks to process
    stage_config : UDFStageSchema
        Configuration for the UDF stage. Its ``ignore_empty_udf`` flag decides whether a
        missing UDF task or an empty ``udf_function`` is skipped or reported as an error.

    Returns
    -------
    IngestControlMessage
        The control message after processing all UDF tasks

    Raises
    ------
    ValueError
        If there are no UDF tasks or a task has an empty function string (and
        ``ignore_empty_udf`` is not set), if ``udf_function_name`` is missing, if the UDF
        cannot be compiled or validated, if the UDF raises while running, or if it
        returns something other than an IngestControlMessage.
    """
    logger.debug("Starting UDF stage processing")

    # Extract all UDF tasks from control message using free function
    try:
        all_task_configs = remove_all_tasks_by_type(control_message, "udf")
    except ValueError as e:
        # No UDF tasks found
        if stage_config.ignore_empty_udf:
            logger.debug("No UDF tasks found, ignoring as configured")
            return control_message
        raise ValueError("No UDF tasks found in control message") from e

    total_tasks = len(all_task_configs)

    # Process each UDF task sequentially
    for task_num, task_config in enumerate(all_task_configs, 1):
        logger.debug(f"Processing UDF task {task_num} of {total_tasks}")

        # `or ""` also covers properties that are present but explicitly None, which would
        # otherwise fail on .strip() instead of being reported as empty/missing.
        udf_function_str = (task_config.get("udf_function") or "").strip()
        udf_function_name = (task_config.get("udf_function_name") or "").strip()

        # Skip empty UDF functions if configured to ignore them
        if not udf_function_str:
            if stage_config.ignore_empty_udf:
                logger.debug(f"UDF task {task_num} has empty function, skipping as configured")
                continue
            raise ValueError(f"UDF task {task_num} has empty function string")

        # Validate that function name is provided
        if not udf_function_name:
            raise ValueError(f"UDF task {task_num} missing required 'udf_function_name' property")

        # Reuse the compiled function when this source/name pair is already cached
        cached_udf = _udf_cache.get(udf_function_str, udf_function_name)
        if cached_udf:
            udf_function = cached_udf.function
        else:
            udf_function = compile_and_validate_udf(udf_function_str, udf_function_name, task_num)
            _udf_cache.put(udf_function_str, udf_function_name, udf_function)

        # Execute the UDF function with the control message
        try:
            result = udf_function(control_message)
        except Exception as e:
            raise ValueError(f"UDF task {task_num} execution failed: {str(e)}") from e

        # Validate that the UDF function returned an IngestControlMessage
        if not isinstance(result, IngestControlMessage):
            raise ValueError(f"UDF task {task_num} must return an IngestControlMessage, got {type(result)}")

        # The next UDF (and the caller) sees the message this UDF returned.
        control_message = result
        logger.debug(f"UDF task {task_num} completed successfully")

    logger.debug(f"UDF stage processing completed. Processed {total_tasks} UDF tasks")
    return control_message
@@ -5,10 +5,11 @@
5
5
  import copy
6
6
  import re
7
7
  from datetime import datetime
8
+ from collections import defaultdict
9
+ from typing import Any, Dict, Generator, List, Optional, Union
8
10
 
9
11
  import logging
10
12
  import pandas as pd
11
- from typing import Any, Dict, Generator, Union
12
13
 
13
14
  from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
14
15
 
@@ -55,6 +56,52 @@ def remove_task_by_type(ctrl_msg, task: str):
55
56
  return removed_task.properties
56
57
 
57
58
 
59
def remove_all_tasks_by_type(ctrl_msg, task: str):
    """
    Remove every task of a given type from the control message.

    The tasks whose ``type`` equals ``task`` are collected first, then removed one by
    one through ``ctrl_msg.remove_task`` using each task's ``id``.

    Parameters
    ----------
    ctrl_msg : IngestControlMessage
        The control message from which to remove the tasks.
    task : str
        The task type to remove.

    Returns
    -------
    list[dict]
        The ``properties`` of each removed task, in the order the tasks were found.

    Raises
    ------
    ValueError
        If no tasks with the given type are found.
    """
    # Materialize the matches before removing anything, so the task collection is not
    # modified while it is still being iterated.
    matches = [candidate for candidate in ctrl_msg.get_tasks() if candidate.type == task]

    if not matches:
        err_msg = f"process_control_message: No tasks of type '{task}' found in control message."
        logger.error(err_msg)
        raise ValueError(err_msg)

    # NOTE(review): removal is keyed by task id only; if tasks of different types can
    # share an id, remove_task may hand back a different task than the match - confirm.
    return [ctrl_msg.remove_task(match.id).properties for match in matches]
103
+
104
+
58
105
  class IngestControlMessage:
59
106
  """
60
107
  A control message class for ingesting tasks and managing associated metadata,
@@ -65,47 +112,41 @@ class IngestControlMessage:
65
112
  """
66
113
  Initialize a new IngestControlMessage instance.
67
114
  """
68
- self._tasks: Dict[str, ControlMessageTask] = {}
115
+ self._tasks: Dict[str, List[ControlMessageTask]] = defaultdict(list)
69
116
  self._metadata: Dict[str, Any] = {}
70
117
  self._timestamps: Dict[str, datetime] = {}
71
- self._payload: pd.DataFrame = pd.DataFrame()
118
+ self._payload: Optional[pd.DataFrame] = None
72
119
  self._config: Dict[str, Any] = {}
73
120
 
74
121
  def add_task(self, task: ControlMessageTask):
75
122
  """
76
- Add a task to the control message, keyed by the task's unique 'id'.
77
-
78
- Raises
79
- ------
80
- ValueError
81
- If a task with the same 'id' already exists.
123
+ Add a task to the control message. Multiple tasks with the same ID are supported.
82
124
  """
83
- if task.id in self._tasks:
84
- raise ValueError(f"Task with id '{task.id}' already exists. Tasks must be unique.")
85
- self._tasks[task.id] = task
125
+ self._tasks[task.id].append(task)
86
126
 
87
127
  def get_tasks(self) -> Generator[ControlMessageTask, None, None]:
88
128
  """
89
129
  Return all tasks as a generator.
90
130
  """
91
- yield from self._tasks.values()
131
+ for task_list in self._tasks.values():
132
+ yield from task_list
92
133
 
93
134
  def has_task(self, task_id: str) -> bool:
94
135
  """
95
- Check if a task with the given ID exists.
136
+ Check if any tasks with the given ID exist.
96
137
  """
97
- return task_id in self._tasks
138
+ return task_id in self._tasks and len(self._tasks[task_id]) > 0
98
139
 
99
140
  def remove_task(self, task_id: str) -> ControlMessageTask:
100
141
  """
101
- Remove a task from the control message. Logs a warning if the task does not exist.
142
+ Remove and return the first task with the given ID. Raises RuntimeError if no such task exists.
102
143
  """
103
- if task_id in self._tasks:
104
- _task = self._tasks[task_id]
105
-
106
- del self._tasks[task_id]
107
-
108
- return _task
144
+ if task_id in self._tasks and self._tasks[task_id]:
145
+ task = self._tasks[task_id].pop(0)
146
+ # Clean up empty lists
147
+ if not self._tasks[task_id]:
148
+ del self._tasks[task_id]
149
+ return task
109
150
  else:
110
151
  raise RuntimeError(f"Attempted to remove non-existent task with id: {task_id}")
111
152
 
@@ -5,24 +5,30 @@
5
5
 
6
6
  import functools
7
7
  import inspect
8
+ import logging
8
9
  import string
9
10
  from datetime import datetime
11
+ from typing import Optional
10
12
 
13
+ logger = logging.getLogger(__name__)
11
14
 
12
- def traceable(trace_name=None):
15
+
16
+ def traceable(trace_name: Optional[str] = None):
13
17
  """
14
18
  A decorator that adds entry and exit trace timestamps to a IngestControlMessage's metadata
15
19
  based on the presence of a 'config::add_trace_tagging' flag.
16
20
 
17
21
  This decorator checks if the 'config::add_trace_tagging' flag is set to True in the
18
22
  message's metadata. If so, it records the entry and exit timestamps of the function
19
- execution, using either a provided custom trace name or the function's name by default.
23
+ execution, using either a provided custom trace name, auto-detected stage name from
24
+ self.stage_name, or the function's name as fallback.
20
25
 
21
26
  Parameters
22
27
  ----------
23
28
  trace_name : str, optional
24
- A custom name for the trace entries in the message metadata. If not provided, the
25
- function's name is used by default.
29
+ A custom name for the trace entries in the message metadata. If not provided,
30
+ attempts to use self.stage_name from the decorated method's instance,
31
+ falling back to the function's name if neither is available.
26
32
 
27
33
  Returns
28
34
  -------
@@ -41,26 +47,48 @@ def traceable(trace_name=None):
41
47
  - 'trace::entry::<trace_name>': The timestamp marking the function's entry.
42
48
  - 'trace::exit::<trace_name>': The timestamp marking the function's exit.
43
49
 
44
- Example
45
- -------
46
- Applying the decorator without a custom trace name:
50
+ Examples
51
+ --------
52
+ Automatic stage name detection (recommended):
47
53
 
48
- >>> @traceable()
49
- ... def process_message(message):
54
+ >>> @traceable() # Uses self.stage_name automatically
55
+ ... def process_message(self, message):
50
56
  ... pass
51
57
 
52
- Applying the decorator with a custom trace name on a class method:
58
+ Explicit trace name (override):
53
59
 
54
- >>> class Processor:
55
- ... @traceable(trace_name="CustomTrace")
56
- ... def process(self, message):
57
- ... pass
60
+ >>> @traceable("custom_trace")
61
+ ... def process_message(self, message):
62
+ ... pass
63
+
64
+ Function without instance (uses function name):
65
+
66
+ >>> @traceable()
67
+ ... def process_message(message):
68
+ ... pass
58
69
  """
59
70
 
60
71
  def decorator_trace_tagging(func):
61
72
  @functools.wraps(func)
62
73
  def wrapper_trace_tagging(*args, **kwargs):
63
74
  ts_fetched = datetime.now()
75
+
76
+ # Determine the trace name to use
77
+ resolved_trace_name = trace_name
78
+
79
+ # If no explicit trace_name provided, try to get it from self.stage_name
80
+ if resolved_trace_name is None and len(args) >= 1:
81
+ stage_instance = args[0] # 'self' in method calls
82
+ if hasattr(stage_instance, "stage_name") and stage_instance.stage_name:
83
+ resolved_trace_name = stage_instance.stage_name
84
+ logger.debug(f"Using auto-detected trace name: '{resolved_trace_name}'")
85
+ else:
86
+ resolved_trace_name = func.__name__
87
+ logger.debug(f"Using function name as trace name: '{resolved_trace_name}'")
88
+ elif resolved_trace_name is None:
89
+ resolved_trace_name = func.__name__
90
+ logger.debug(f"Using function name as trace name: '{resolved_trace_name}'")
91
+
64
92
  # Determine which argument is the message.
65
93
  if hasattr(args[0], "has_metadata"):
66
94
  message = args[0]
@@ -73,7 +101,7 @@ def traceable(trace_name=None):
73
101
  message.get_metadata("config::add_trace_tagging") is True
74
102
  )
75
103
 
76
- trace_prefix = trace_name if trace_name else func.__name__
104
+ trace_prefix = resolved_trace_name
77
105
 
78
106
  if do_trace_tagging:
79
107
  ts_send = message.get_timestamp("latency::ts_send")
@@ -199,3 +227,62 @@ def traceable_func(trace_name=None, dedupe=True):
199
227
  return wrapper_inject_trace_info
200
228
 
201
229
  return decorator_inject_trace_info
230
+
231
+
232
def set_trace_timestamps_with_parent_context(control_message, execution_trace_log: dict, parent_name: str, logger=None):
    """
    Copy trace timestamps onto a control message, namespacing them under a parent.

    Every entry of ``execution_trace_log`` is written to ``control_message`` through
    ``set_timestamp``. Keys of the form ``trace::<type>::<name>`` whose ``<name>`` does
    not already start with ``<parent_name>::`` are rewritten to
    ``trace::<type>::<parent_name>::<name>``; all other keys are written unchanged.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message to set timestamps on
    execution_trace_log : dict
        Dictionary of trace keys to timestamp values. Nothing is written when it is
        empty or None.
    parent_name : str
        The parent stage name inserted into rewritten keys
    logger : logging.Logger, optional
        When given, each rewritten key is reported through ``logger.debug``

    Examples
    --------
    >>> execution_trace_log = {"trace::entry::yolox_inference": ts1, "trace::exit::yolox_inference": ts2}
    >>> set_trace_timestamps_with_parent_context(
    ...     control_message, execution_trace_log, "pdf_extractor", logger
    ... )

    This transforms:
    - trace::entry::yolox_inference -> trace::entry::pdf_extractor::yolox_inference
    - trace::exit::yolox_inference -> trace::exit::pdf_extractor::yolox_inference
    """
    if not execution_trace_log:
        return

    parent_prefix = f"{parent_name}::"

    for key, ts in execution_trace_log.items():
        target_key = key

        # Only "trace::" keys are candidates; anything else keeps its key untouched.
        segments = key.split("::") if key.startswith("trace::") else []

        if len(segments) >= 3:  # e.g., ["trace", "entry", "yolox_inference"]
            trace_type = segments[1]
            child_name = "::".join(segments[2:])  # everything after trace::<type>::

            # Leave keys alone that already carry the parent context.
            if not child_name.startswith(parent_prefix):
                target_key = f"trace::{trace_type}::{parent_name}::{child_name}"

                if logger:
                    logger.debug(f"Enhanced trace key: {key} -> {target_key}")

        control_message.set_timestamp(target_key, ts)