sdg-hub 0.4.2__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225) hide show
  1. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/integration-test.yml +48 -34
  2. sdg_hub-0.5.0/.github/workflows/packer.yml +33 -0
  3. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/test.yml +0 -13
  4. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/CLAUDE.md +0 -7
  5. {sdg_hub-0.4.2/src/sdg_hub.egg-info → sdg_hub-0.5.0}/PKG-INFO +2 -2
  6. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +1 -1
  7. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +2 -9
  8. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/pyproject.toml +1 -1
  9. sdg_hub-0.5.0/scripts/packer/centos.pkr.hcl +52 -0
  10. sdg_hub-0.5.0/scripts/packer/setup-centos.sh +80 -0
  11. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/_version.py +3 -3
  12. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/__init__.py +0 -22
  13. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/rename_columns.py +19 -0
  14. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/base.py +8 -80
  15. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
  16. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
  17. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
  18. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
  19. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +5 -1
  20. {sdg_hub-0.4.2 → sdg_hub-0.5.0/src/sdg_hub.egg-info}/PKG-INFO +2 -2
  21. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/SOURCES.txt +3 -21
  22. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/requires.txt +1 -1
  23. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_json_structure_block.py +1 -1
  24. sdg_hub-0.4.2/tests/blocks/utilblocks/test_renameblock.py → sdg_hub-0.5.0/tests/blocks/transform/test_rename_columns.py +19 -19
  25. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_uniform_col_val_setter.py +1 -1
  26. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +73 -3
  27. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tox.ini +2 -2
  28. sdg_hub-0.4.2/.github/workflows/e2e.yml +0 -103
  29. sdg_hub-0.4.2/.github/workflows/packer.yml +0 -15
  30. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
  31. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
  32. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
  33. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
  34. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
  35. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
  36. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
  37. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
  38. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
  39. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
  40. sdg_hub-0.4.2/src/sdg_hub/core/flow/migration.py +0 -198
  41. sdg_hub-0.4.2/tests/blocks/deprecated/test_llmblock.py +0 -148
  42. sdg_hub-0.4.2/tests/blocks/utilblocks/test_combinecolumns.py +0 -168
  43. sdg_hub-0.4.2/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -112
  44. sdg_hub-0.4.2/tests/blocks/utilblocks/test_flattenblock.py +0 -217
  45. sdg_hub-0.4.2/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -37
  46. sdg_hub-0.4.2/tests/blocks/utilblocks/test_selectorblock.py +0 -144
  47. sdg_hub-0.4.2/tests/blocks/utilblocks/test_settomajority.py +0 -127
  48. sdg_hub-0.4.2/tests/flow/test_migration.py +0 -449
  49. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/actionlint.yaml +0 -0
  50. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/actions/free-disk-space/action.yml +0 -0
  51. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/dependabot.yml +0 -0
  52. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/mergify.yml +0 -0
  53. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/actionlint.dockerfile +0 -0
  54. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/actionlint.yml +0 -0
  55. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/docs.yml +0 -0
  56. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/lint.yml +0 -0
  57. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/matchers/actionlint.json +0 -0
  58. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/matchers/pylint.json +0 -0
  59. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/pypi.yaml +0 -0
  60. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.gitignore +0 -0
  61. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.isort.cfg +0 -0
  62. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.markdownlint-cli2.yaml +0 -0
  63. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.pre-commit-config.yaml +0 -0
  64. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.pylintrc +0 -0
  65. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/CONTRIBUTING.md +0 -0
  66. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/LICENSE +0 -0
  67. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/Makefile +0 -0
  68. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/README.md +0 -0
  69. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/.nojekyll +0 -0
  70. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/README.md +0 -0
  71. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/_coverpage.md +0 -0
  72. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/_navbar.md +0 -0
  73. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/_sidebar.md +0 -0
  74. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/api-reference.md +0 -0
  75. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/custom-blocks.md +0 -0
  76. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/filtering-blocks.md +0 -0
  77. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/llm-blocks.md +0 -0
  78. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/overview.md +0 -0
  79. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/transform-blocks.md +0 -0
  80. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/concepts.md +0 -0
  81. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/development.md +0 -0
  82. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/flows/discovery.md +0 -0
  83. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/flows/overview.md +0 -0
  84. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/index.html +0 -0
  85. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/installation.md +0 -0
  86. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/quick-start.md +0 -0
  87. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/annotation_classification.ipynb +0 -0
  88. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
  89. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/news_classification_flow.yaml +0 -0
  90. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/news_classification_prompt.yaml +0 -0
  91. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
  92. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
  93. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  94. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +0 -0
  95. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +0 -0
  96. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
  97. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
  98. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/README.md +0 -0
  99. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
  100. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
  101. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  102. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
  103. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  104. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  105. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  106. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  107. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
  108. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
  109. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
  110. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/knowledge_utils.py +0 -0
  111. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/text_analysis/README.md +0 -0
  112. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
  113. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
  114. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/scripts/ruff.sh +0 -0
  115. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/setup.cfg +0 -0
  116. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/__init__.py +0 -0
  117. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/__init__.py +0 -0
  118. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/base.py +0 -0
  119. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
  120. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
  121. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
  122. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
  123. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +0 -0
  124. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -0
  125. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +0 -0
  126. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
  127. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/text_parser_block.py +0 -0
  128. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/registry.py +0 -0
  129. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
  130. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
  131. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
  132. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
  133. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
  134. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
  135. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
  136. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/__init__.py +0 -0
  137. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/checkpointer.py +0 -0
  138. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/metadata.py +0 -0
  139. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/registry.py +0 -0
  140. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/validation.py +0 -0
  141. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/__init__.py +0 -0
  142. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/datautils.py +0 -0
  143. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/error_handling.py +0 -0
  144. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
  145. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
  146. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/flow_metrics.py +0 -0
  147. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/logger_config.py +0 -0
  148. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/path_resolution.py +0 -0
  149. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/time_estimator.py +0 -0
  150. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
  151. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
  152. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
  153. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
  154. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
  155. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -0
  156. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  157. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
  158. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
  159. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
  160. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
  161. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  162. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
  163. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
  164. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
  165. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
  166. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
  167. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
  168. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
  169. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
  170. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
  171. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
  172. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
  173. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
  174. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
  175. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
  176. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
  177. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
  178. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
  179. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
  180. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
  181. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
  182. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
  183. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -0
  184. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
  185. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/py.typed +0 -0
  186. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  187. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/top_level.txt +0 -0
  188. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/__init__.py +0 -0
  189. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
  190. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_llm_chat_block.py +0 -0
  191. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +0 -0
  192. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_llm_parser_block.py +0 -0
  193. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
  194. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_textparserblock.py +0 -0
  195. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/test_base_block.py +0 -0
  196. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/test_registry.py +0 -0
  197. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_config.yaml +0 -0
  198. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
  199. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
  200. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
  201. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
  202. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
  203. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_index_based_mapper.py +0 -0
  204. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_melt_columns.py +0 -0
  205. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_text_concat.py +0 -0
  206. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/__init__.py +0 -0
  207. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/conftest.py +0 -0
  208. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_base.py +0 -0
  209. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_checkpointer.py +0 -0
  210. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_dataset_requirements.py +0 -0
  211. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_integration.py +0 -0
  212. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_metadata.py +0 -0
  213. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_registry.py +0 -0
  214. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_time_estimation.py +0 -0
  215. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_validation.py +0 -0
  216. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/README.md +0 -0
  217. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/__init__.py +0 -0
  218. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  219. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
  220. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
  221. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +0 -0
  222. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/utils/test_datautils.py +0 -0
  223. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/utils/test_error_handling.py +0 -0
  224. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/utils/test_flow_metrics.py +0 -0
  225. {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/utils/test_path_resolution.py +0 -0
@@ -7,29 +7,11 @@ on:
7
7
  branches:
8
8
  - "main"
9
9
  - "release-**"
10
- paths:
11
- # Only trigger on changes to relevant flows and examples (EXTEND THIS):
12
- - 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
13
- - 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
14
- # Standard integration test triggers, DONT CHANGE THIS
15
- - 'tests/integration/**/*.py'
16
- - 'pyproject.toml'
17
- - 'tox.ini'
18
- - '.github/workflows/integration-test.yml'
19
10
  pull_request:
20
11
  branches:
21
12
  - "main"
22
13
  - "release-**"
23
14
  types: [opened, synchronize, reopened, labeled]
24
- paths:
25
- # Only trigger on changes to relevant flows and examples (EXTEND THIS):
26
- - 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
27
- - 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
28
- # Standard integration test triggers, DONT CHANGE THIS
29
- - 'tests/integration/**/*.py'
30
- - 'pyproject.toml'
31
- - 'tox.ini'
32
- - '.github/workflows/integration-test.yml'
33
15
 
34
16
  env:
35
17
  LC_ALL: en_US.UTF-8
@@ -42,19 +24,58 @@ permissions:
42
24
  contents: read
43
25
 
44
26
  jobs:
27
+ check-trigger:
28
+ name: "Check If Integration Should Run"
29
+ runs-on: ubuntu-latest
30
+ outputs:
31
+ should_run: ${{ steps.check.outputs.should_run }}
32
+ steps:
33
+ - uses: actions/checkout@v4
34
+
35
+ - uses: dorny/paths-filter@v3
36
+ id: filter
37
+ if: github.event_name == 'pull_request'
38
+ with:
39
+ filters: |
40
+ relevant:
41
+ # Only trigger on changes to relevant flows and examples (EXTEND THIS):
42
+ - 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
43
+ - 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
44
+ # Standard integration test triggers, DON'T CHANGE THIS
45
+ - 'tests/integration/**/*.py'
46
+ - 'pyproject.toml'
47
+ - 'tox.ini'
48
+ - '.github/workflows/integration-test.yml'
49
+
50
+ - name: Determine if tests should run
51
+ id: check
52
+ run: |
53
+ if [[ "${{ github.event_name }}" == "workflow_dispatch" ]] || [[ "${{ github.event_name }}" == "push" ]]; then
54
+ echo "should_run=true" >> "$GITHUB_OUTPUT"
55
+ elif [[ "${{ github.event_name }}" == "pull_request" ]]; then
56
+ # Check if from fork
57
+ if [[ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]]; then
58
+ echo "should_run=false" >> "$GITHUB_OUTPUT"
59
+ # Check if labeled event with correct label
60
+ elif [[ "${{ github.event.action }}" == "labeled" ]] && [[ "${{ contains(github.event.pull_request.labels.*.name, 'run-integration-tests') }}" == "true" ]]; then
61
+ echo "should_run=true" >> "$GITHUB_OUTPUT"
62
+ # Check if relevant paths changed for non-labeled events
63
+ elif [[ "${{ github.event.action }}" != "labeled" ]] && [[ "${{ steps.filter.outputs.relevant }}" == "true" ]]; then
64
+ echo "should_run=true" >> "$GITHUB_OUTPUT"
65
+ else
66
+ echo "should_run=false" >> "$GITHUB_OUTPUT"
67
+ fi
68
+ else
69
+ echo "should_run=false" >> "$GITHUB_OUTPUT"
70
+ fi
71
+
45
72
  integration-test:
46
73
  name: "Integration Tests - ${{ matrix.python }} on ${{ matrix.platform }}"
47
74
  runs-on: "${{ matrix.platform }}"
75
+ needs: check-trigger
76
+ if: needs.check-trigger.outputs.should_run == 'true'
48
77
  # Require manual approval before running (via GitHub Environment)
49
78
  environment: integration-tests
50
- # Skip fork PRs (they can't access environment secrets anyway)
51
- # Also check for 'run-integration-tests' label on labeled events
52
- if: |
53
- github.event_name == 'workflow_dispatch' ||
54
- github.event_name == 'push' ||
55
- (github.event_name == 'pull_request' &&
56
- github.event.pull_request.head.repo.full_name == github.repository &&
57
- (github.event.action != 'labeled' || contains(github.event.pull_request.labels.*.name, 'run-integration-tests')))
58
79
  strategy:
59
80
  matrix:
60
81
  python:
@@ -89,12 +110,9 @@ jobs:
89
110
  **/pyproject.toml
90
111
  **/requirements*.txt
91
112
 
92
- - name: Remove llama-cpp-python from cache
93
- run: |
94
- pip cache remove llama_cpp_python
95
113
 
96
114
  - name: Cache huggingface datasets
97
- uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
115
+ uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
98
116
  with:
99
117
  path: ~/.cache/huggingface
100
118
  # Invalidate cache when any example notebook changes (may affect dataset downloads)
@@ -111,10 +129,6 @@ jobs:
111
129
  run: |
112
130
  tox -e py3-integrationcov
113
131
 
114
- - name: Remove llama-cpp-python from cache
115
- if: always()
116
- run: |
117
- pip cache remove llama_cpp_python
118
132
 
119
133
  - name: Upload integration test coverage to Codecov
120
134
  uses: codecov/codecov-action@v4
@@ -0,0 +1,33 @@
1
+ name: Build AMI with Packer
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ jobs:
7
+ build-ami:
8
+ runs-on: ubuntu-latest
9
+ permissions:
10
+ id-token: write # This is required for OIDC
11
+ contents: read
12
+
13
+ steps:
14
+ - name: Checkout repository
15
+ uses: actions/checkout@v4
16
+
17
+ - name: Configure AWS Credentials
18
+ uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c
19
+ with:
20
+ role-to-assume: arn:aws:iam::851725220677:role/github-actions-packer-role
21
+ aws-region: us-east-2
22
+ role-session-name: github-actions-packer # For tracking in CloudTrail
23
+
24
+ - name: Setup Packer
25
+ uses: hashicorp/setup-packer@1aa358be5cf73883762b302a3a03abd66e75b232
26
+
27
+ - name: Build and create AMI
28
+ run: |
29
+ set -euo pipefail
30
+ cd scripts/packer
31
+ packer init .
32
+ packer validate .
33
+ packer build .
@@ -86,16 +86,7 @@ jobs:
86
86
  **/pyproject.toml
87
87
  **/requirements*.txt
88
88
 
89
- - name: Remove llama-cpp-python from cache
90
- run: |
91
- pip cache remove llama_cpp_python
92
89
 
93
- - name: Cache huggingface
94
- uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
95
- with:
96
- path: ~/.cache/huggingface
97
- # config contains DEFAULT_MODEL
98
- key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }}
99
90
 
100
91
  - name: Install dependencies
101
92
  run: |
@@ -107,10 +98,6 @@ jobs:
107
98
  tox -e py3-unitcov
108
99
 
109
100
 
110
- - name: Remove llama-cpp-python from cache
111
- if: always()
112
- run: |
113
- pip cache remove llama_cpp_python
114
101
 
115
102
  - name: Upload coverage to Codecov
116
103
  uses: codecov/codecov-action@v4
@@ -86,7 +86,6 @@ The framework is built around a modular block system with **composability at its
86
86
  - `transform/`: Data transformation blocks (column operations, text manipulation)
87
87
  - `filtering/`: Data filtering blocks with quality thresholds
88
88
  - `evaluation/`: Quality evaluation blocks (faithfulness, relevancy assessment)
89
- - `deprecated_blocks/`: Legacy blocks maintained for backward compatibility
90
89
 
91
90
  **Key Benefits**: Type-safe composition, automatic validation, rich logging, and high-performance async processing.
92
91
 
@@ -97,7 +96,6 @@ Flows orchestrate multiple blocks into data processing pipelines:
97
96
  - **FlowRegistry** (`src/sdg_hub/core/flow/registry.py`): Registry for flow discovery
98
97
  - **FlowMetadata** (`src/sdg_hub/core/flow/metadata.py`): Metadata and parameter definitions
99
98
  - **FlowValidator** (`src/sdg_hub/core/flow/validation.py`): YAML structure validation
100
- - **FlowMigration** (`src/sdg_hub/core/flow/migration.py`): Backward compatibility for old flow formats
101
99
 
102
100
  ### Flow Configuration
103
101
  Flows are defined in YAML files with this structure:
@@ -148,11 +146,6 @@ All blocks operate on HuggingFace `datasets.Dataset` objects:
148
146
  - Rich logging provides processing summaries
149
147
  - Empty dataset handling with appropriate errors
150
148
 
151
- ### Backward Compatibility
152
- The framework maintains compatibility with legacy formats:
153
- - Deprecated blocks are preserved in `deprecated_blocks/`
154
- - Flow migration automatically converts old YAML formats
155
- - Legacy LLMBlocks receive special handling during execution
156
149
 
157
150
  ## Testing Guidelines
158
151
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.4.2
3
+ Version: 0.5.0
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: click<9.0.0,>=8.1.7
26
- Requires-Dist: datasets<4.0.0,>=2.18.0
26
+ Requires-Dist: datasets>=4.0.0
27
27
  Requires-Dist: httpx<1.0.0,>=0.25.0
28
28
  Requires-Dist: jinja2
29
29
  Requires-Dist: litellm<1.75.0,>=1.73.0
@@ -359,7 +359,7 @@
359
359
  "processed_knowledge_dataset = processed_knowledge_dataset.remove_columns(['messages']).rename_column('messages_without_think', 'messages')\n",
360
360
  "\n",
361
361
  "cfg = RAFTConfig(k_passages=5, max_tokens_per_chunk=400, p_include_oracle=0.9)\n",
362
- "raft_samples = build_raft_samples(ds, cfg)\n",
362
+ "raft_samples = build_raft_samples(processed_knowledge_dataset, cfg)\n",
363
363
  "raft_samples = raft_samples.map(build_messages).remove_columns(['question', 'context', 'oracle_context', 'cot_answer', 'answer', 'instruction', 'type', 'meta'])\n",
364
364
  "\n",
365
365
  "fp = \"<Instruction/Skills dataset>\" # TODO: Replace with huggingface dataset path once its uploaded\n",
@@ -237,20 +237,13 @@ def build_messages(raft_record: Dict[str, Any]):
237
237
  Output:
238
238
  messages: list of {"role": "system"|"user"|"assistant", "content": str}
239
239
  """
240
- # 1. System message
241
- sys_msg = raft_record.get("instruction") or (
242
- "You are a domain expert. You must answer questions by first quoting a span "
243
- "verbatim from the relevant passage, then giving reasoning, then the final answer. "
244
- "Ignore distractor passages."
245
- )
246
-
247
- # 2. User message: serialize passages + question
240
+ # 1. User message: serialize passages + question
248
241
  passages = "\n\n".join(
249
242
  [f"[Passage {i+1}] {p}" for i, p in enumerate(raft_record["context"])]
250
243
  )
251
244
  user_msg = f"Passages:\n{passages}\n\nQuestion: {raft_record['question']}"
252
245
 
253
- # 3. Assistant message: the gold output
246
+ # 2. Assistant message: the gold output
254
247
  assistant_msg = raft_record["answer"]
255
248
 
256
249
  return {"messages" : [
@@ -30,7 +30,7 @@ classifiers = [
30
30
  # Core dependencies moved from requirements.txt
31
31
  dependencies = [
32
32
  "click>=8.1.7,<9.0.0",
33
- "datasets>=2.18.0,<4.0.0",
33
+ "datasets>=4.0.0",
34
34
  "httpx>=0.25.0,<1.0.0",
35
35
  "jinja2",
36
36
  "litellm>=1.73.0,<1.75.0",
@@ -0,0 +1,52 @@
1
+ packer {
2
+ required_plugins {
3
+ amazon = {
4
+ version = ">= 1.2.8"
5
+ source = "github.com/hashicorp/amazon"
6
+ }
7
+ }
8
+ }
9
+
10
+ variable "github_sha" {
11
+ type = string
12
+ description = "GitHub commit SHA to tag the AMI with"
13
+ default = env("GITHUB_SHA")
14
+ }
15
+
16
+ variable "github_repository" {
17
+ type = string
18
+ description = "GitHub repository name to tag the AMI with"
19
+ default = env("GITHUB_REPOSITORY")
20
+ }
21
+
22
+ source "amazon-ebs" "centos" {
23
+ ami_name = "github-actions-centos-nvidia-ami-{{timestamp}}"
24
+ # Use the lowest-cost instance type that can efficiently build and sanity-check the driver.
25
+ # It should be old enough to be low-cost, but new enough to be compatible with our desired driver version.
26
+ instance_type = "g6.xlarge"
27
+ region = "us-east-2"
28
+ source_ami_filter {
29
+ filters = {
30
+ name = "CentOS Stream 9 x86_64*"
31
+ root-device-type = "ebs"
32
+ virtualization-type = "hvm"
33
+ }
34
+ most_recent = true
35
+ owners = ["125523088429"] # CentOS CPE team ID.
36
+ }
37
+ ssh_username = "ec2-user"
38
+ tags = {
39
+ Name = "CentOS Stream 9 with Nvidia Drivers"
40
+ BuiltBy = "Packer"
41
+ GitHubCommitSHA = var.github_sha
42
+ GitHubRepository = var.github_repository
43
+ }
44
+ }
45
+
46
+ build {
47
+ sources = ["source.amazon-ebs.centos"]
48
+ provisioner "shell" {
49
+ script = "./setup-centos.sh"
50
+ execute_command = "sudo bash {{.Path}}"
51
+ }
52
+ }
@@ -0,0 +1,80 @@
1
+ #!/bin/bash
2
+ # Setup script for CentOS GitHub Actions AMI
3
+ # Derived from:
4
+ # github.com/containers/ai-lab-recipes/blob/main/training/nvidia-bootc/Containerfile
5
+
6
+ set -euxo pipefail
7
+
8
+ DRIVER_VERSION="580.65.06"
9
+ # CUDA_VERSION is embedded in the driver "local repo" package
10
+
11
+ if [[ $(id -u) != "0" ]]; then
12
+ echo "you must run this script as root."
13
+ exit 1
14
+ fi
15
+
16
+ function configure_dnf {
17
+ # Configure the DNF repos and options we need for CI.
18
+ dnf -y install dnf-plugins-core
19
+ dnf config-manager --save \
20
+ --setopt=skip_missing_names_on_install=False \
21
+ --setopt=install_weak_deps=False
22
+
23
+ dnf -y install epel-release
24
+ dnf -y install https://us.download.nvidia.com/tesla/$DRIVER_VERSION/nvidia-driver-local-repo-rhel9-$DRIVER_VERSION-1.0-1.x86_64.rpm
25
+ # TODO: We might be able to use a nvidia.com yum repo instead of the local repo?
26
+ # dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel${OS_VERSION_MAJOR}/${CUDA_REPO_ARCH}/cuda-rhel${OS_VERSION_MAJOR}.repo
27
+ }
28
+
29
+ function install_userland_packages {
30
+ # CI tests in GH Actions will require these packages:
31
+ dnf -y install nvtop podman skopeo git python3.12 python3.12-devel
32
+ }
33
+
34
+ function install_kernel_driver {
35
+ # Install nvidia kernel driver.
36
+ # DKMS will compile the nvidia.ko driver for all kernels for which we have installed a kernel-devel package.
37
+ # By default, the "dnf module install" command will install the latest kernel-devel package that CentOS has published.
38
+ dnf -y install "kernel-devel-$(uname -r)" gcc make dkms elfutils-libelf-devel # also build for the currently-running kernel.
39
+ # If we had configured a previous nvidia-driver version with DNF, reset it:
40
+ dnf -y module reset nvidia-driver || true
41
+ DRIVER_STREAM=$(echo $DRIVER_VERSION | cut -d. -f1)
42
+ dnf -y module install nvidia-driver:${DRIVER_STREAM}-dkms # or use :latest-dkms after confirming available streams
43
+ }
44
+
45
+ function test_kernel_driver {
46
+ # The nvidia driver DNF module (above) installs a dkms RPM.
47
+ # That dkms RPM compiles and installs the nvidia.ko module.
48
+ # List all the modules that dkms has compiled:
49
+ dkms status || true
50
+ # Load the module (ok if it’s already loaded or unavailable for this kernel):
51
+ modprobe -q nvidia || true
52
+ # If a GPU is present, verify userspace; otherwise, fail the job:
53
+ nvidia-smi
54
+ }
55
+
56
+ function install_container_toolkit {
57
+ # Install nvidia container toolkit.
58
+ # When we pass GPU devices to a container (podman run --device nvidia.com/gpu=all), we use the nvidia CTK to do that.
59
+ # See docs at https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
60
+ curl -sSfL -o /etc/yum.repos.d/nvidia-container-toolkit.repo \
61
+ https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
62
+ dnf config-manager --enable nvidia-container-toolkit-experimental
63
+ export NVIDIA_CONTAINER_TOOLKIT_VERSION=1.17.8-1
64
+ dnf install -y \
65
+ nvidia-container-toolkit-${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
66
+ nvidia-container-toolkit-base-${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
67
+ libnvidia-container-tools-${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
68
+ libnvidia-container1-${NVIDIA_CONTAINER_TOOLKIT_VERSION}
69
+ # Verify it works:
70
+ nvidia-ctk --version
71
+ # When you boot a node, you must run:
72
+ # sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
73
+ # This command scans your system for NVIDIA GPUs and creates a YAML file that lists the available devices.
74
+ }
75
+
76
+ configure_dnf
77
+ install_userland_packages
78
+ install_kernel_driver
79
+ test_kernel_driver
80
+ install_container_toolkit
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.4.2'
32
- __version_tuple__ = version_tuple = (0, 4, 2)
31
+ __version__ = version = '0.5.0'
32
+ __version_tuple__ = version_tuple = (0, 5, 0)
33
33
 
34
- __commit_id__ = commit_id = 'gfbb2504ba'
34
+ __commit_id__ = commit_id = 'ge1e260984'
@@ -5,17 +5,6 @@ This package provides various block implementations for data generation, process
5
5
 
6
6
  # Local
7
7
  from .base import BaseBlock
8
- from .deprecated_blocks import (
9
- CombineColumnsBlock,
10
- DuplicateColumns,
11
- FilterByValueBlock,
12
- FlattenColumnsBlock,
13
- LLMBlock,
14
- RenameColumns,
15
- SamplePopulatorBlock,
16
- SelectorBlock,
17
- SetToMajorityValue,
18
- )
19
8
  from .filtering import ColumnValueFilterBlock
20
9
  from .llm import LLMChatBlock, LLMParserBlock, PromptBuilderBlock, TextParserBlock
21
10
  from .registry import BlockRegistry
@@ -28,8 +17,6 @@ from .transform import (
28
17
  UniformColumnValueSetter,
29
18
  )
30
19
 
31
- # All blocks moved to deprecated_blocks or transform modules
32
-
33
20
  __all__ = [
34
21
  "BaseBlock",
35
22
  "BlockRegistry",
@@ -40,15 +27,6 @@ __all__ = [
40
27
  "RenameColumnsBlock",
41
28
  "TextConcatBlock",
42
29
  "UniformColumnValueSetter",
43
- "CombineColumnsBlock", # Deprecated
44
- "DuplicateColumns", # Deprecated
45
- "FilterByValueBlock", # Deprecated
46
- "FlattenColumnsBlock", # Deprecated
47
- "RenameColumns", # Deprecated
48
- "SamplePopulatorBlock", # Deprecated
49
- "SelectorBlock", # Deprecated
50
- "SetToMajorityValue", # Deprecated
51
- "LLMBlock", # Deprecated
52
30
  "LLMChatBlock",
53
31
  "LLMParserBlock",
54
32
  "TextParserBlock",
@@ -64,6 +64,25 @@ class RenameColumnsBlock(BaseBlock):
64
64
  -------
65
65
  Dataset
66
66
  Dataset with renamed columns.
67
+
68
+ Raises
69
+ ------
70
+ ValueError
71
+ If attempting to rename to a column name that already exists.
67
72
  """
73
+ # Check for column name collisions
74
+ # Strict validation: no target column name can be an existing column name
75
+ # This prevents chained/circular renames which can be confusing
76
+ existing_cols = set(samples.column_names)
77
+ target_cols = set(self.input_cols.values())
78
+
79
+ collision = target_cols & existing_cols
80
+ if collision:
81
+ raise ValueError(
82
+ f"Cannot rename to existing column names: {sorted(collision)}. "
83
+ "Target column names must not already exist in the dataset. "
84
+ "Chained renames are not supported."
85
+ )
86
+
68
87
  # Rename columns using HuggingFace datasets method
69
88
  return samples.rename_columns(self.input_cols)
@@ -41,7 +41,6 @@ from ..utils.time_estimator import estimate_execution_time
41
41
  from ..utils.yaml_utils import save_flow_yaml
42
42
  from .checkpointer import FlowCheckpointer
43
43
  from .metadata import DatasetRequirements, FlowMetadata
44
- from .migration import FlowMigration
45
44
  from .validation import FlowValidator
46
45
 
47
46
  logger = setup_logger(__name__)
@@ -73,8 +72,6 @@ class Flow(BaseModel):
73
72
  model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
74
73
 
75
74
  # Private attributes (not serialized)
76
- _migrated_runtime_params: dict[str, dict[str, Any]] = {}
77
- _llm_client: Any = None # Only used for backward compatibility with old YAMLs
78
75
  _model_config_set: bool = False # Track if model configuration has been set
79
76
  _block_metrics: list[dict[str, Any]] = PrivateAttr(
80
77
  default_factory=list
@@ -113,16 +110,13 @@ class Flow(BaseModel):
113
110
  return self
114
111
 
115
112
  @classmethod
116
- def from_yaml(cls, yaml_path: str, client: Any = None) -> "Flow":
113
+ def from_yaml(cls, yaml_path: str) -> "Flow":
117
114
  """Load flow from YAML configuration file.
118
115
 
119
116
  Parameters
120
117
  ----------
121
118
  yaml_path : str
122
119
  Path to the YAML flow configuration file.
123
- client : Any, optional
124
- LLM client instance. Required for backward compatibility with old format YAMLs
125
- that use deprecated LLMBlocks. Ignored for new format YAMLs.
126
120
 
127
121
  Returns
128
122
  -------
@@ -153,21 +147,6 @@ class Flow(BaseModel):
153
147
  except yaml.YAMLError as exc:
154
148
  raise FlowValidationError(f"Invalid YAML in {yaml_path}: {exc}") from exc
155
149
 
156
- # Check if this is an old format flow and migrate if necessary
157
- migrated_runtime_params = None
158
- is_old_format = FlowMigration.is_old_format(flow_config)
159
- if is_old_format:
160
- logger.info(f"Detected old format flow, migrating: {yaml_path}")
161
- if client is None:
162
- logger.warning(
163
- "Old format YAML detected but no client provided. LLMBlocks may fail."
164
- )
165
- flow_config, migrated_runtime_params = FlowMigration.migrate_to_new_format(
166
- flow_config, yaml_path
167
- )
168
- # Save migrated config back to YAML to persist id
169
- save_flow_yaml(yaml_path, flow_config, "migrated to new format")
170
-
171
150
  # Validate YAML structure
172
151
  validator = FlowValidator()
173
152
  validation_errors = validator.validate_yaml_structure(flow_config)
@@ -194,19 +173,6 @@ class Flow(BaseModel):
194
173
 
195
174
  for i, block_config in enumerate(block_configs):
196
175
  try:
197
- # Inject client for deprecated LLMBlocks if this is an old format flow
198
- if (
199
- is_old_format
200
- and block_config.get("block_type") == "LLMBlock"
201
- and client is not None
202
- ):
203
- if "block_config" not in block_config:
204
- block_config["block_config"] = {}
205
- block_config["block_config"]["client"] = client
206
- logger.debug(
207
- f"Injected client for deprecated LLMBlock: {block_config['block_config'].get('block_name')}"
208
- )
209
-
210
176
  block = cls._create_block_from_config(block_config, yaml_dir)
211
177
  blocks.append(block)
212
178
  except Exception as exc:
@@ -228,12 +194,6 @@ class Flow(BaseModel):
228
194
  )
229
195
  else:
230
196
  logger.debug(f"Flow already had id: {flow.metadata.id}")
231
- # Store migrated runtime params and client for backward compatibility
232
- if migrated_runtime_params:
233
- flow._migrated_runtime_params = migrated_runtime_params
234
- if is_old_format and client is not None:
235
- flow._llm_client = client
236
-
237
197
  # Check if this is a flow without LLM blocks
238
198
  llm_blocks = flow._detect_llm_blocks()
239
199
  if not llm_blocks:
@@ -484,12 +444,6 @@ class Flow(BaseModel):
484
444
  self._block_metrics = []
485
445
  run_start = time.perf_counter()
486
446
 
487
- # Merge migrated runtime params with provided ones (provided ones take precedence)
488
- merged_runtime_params = self._migrated_runtime_params.copy()
489
- if runtime_params:
490
- merged_runtime_params.update(runtime_params)
491
- runtime_params = merged_runtime_params
492
-
493
447
  # Execute flow with metrics capture, ensuring metrics are always displayed/saved
494
448
  final_dataset = None
495
449
  execution_successful = False
@@ -647,22 +601,8 @@ class Flow(BaseModel):
647
601
  input_cols = set(current_dataset.column_names)
648
602
 
649
603
  try:
650
- # Check if this is a deprecated block and skip validations
651
- is_deprecated_block = (
652
- hasattr(block, "__class__")
653
- and hasattr(block.__class__, "__module__")
654
- and "deprecated_blocks" in block.__class__.__module__
655
- )
656
-
657
- if is_deprecated_block:
658
- exec_logger.debug(
659
- f"Skipping validations for deprecated block: {block.block_name}"
660
- )
661
- # Call generate() directly to skip validations, but keep the runtime params
662
- current_dataset = block.generate(current_dataset, **block_kwargs)
663
- else:
664
- # Execute block with validation and logging
665
- current_dataset = block(current_dataset, **block_kwargs)
604
+ # Execute block with validation and logging
605
+ current_dataset = block(current_dataset, **block_kwargs)
666
606
 
667
607
  # Validate output
668
608
  if len(current_dataset) == 0:
@@ -724,9 +664,11 @@ class Flow(BaseModel):
724
664
  return current_dataset
725
665
 
726
666
  def _prepare_block_kwargs(
727
- self, block: BaseBlock, runtime_params: dict[str, dict[str, Any]]
667
+ self, block: BaseBlock, runtime_params: Optional[dict[str, dict[str, Any]]]
728
668
  ) -> dict[str, Any]:
729
669
  """Prepare execution parameters for a block."""
670
+ if runtime_params is None:
671
+ return {}
730
672
  return runtime_params.get(block.block_name, {})
731
673
 
732
674
  def set_model_config(
@@ -1114,22 +1056,8 @@ class Flow(BaseModel):
1114
1056
  if max_concurrency is not None:
1115
1057
  block_kwargs["_flow_max_concurrency"] = max_concurrency
1116
1058
 
1117
- # Check if this is a deprecated block and skip validations
1118
- is_deprecated_block = (
1119
- hasattr(block, "__class__")
1120
- and hasattr(block.__class__, "__module__")
1121
- and "deprecated_blocks" in block.__class__.__module__
1122
- )
1123
-
1124
- if is_deprecated_block:
1125
- logger.debug(
1126
- f"Dry run: Skipping validations for deprecated block: {block.block_name}"
1127
- )
1128
- # Call generate() directly to skip validations, but keep the runtime params
1129
- current_dataset = block.generate(current_dataset, **block_kwargs)
1130
- else:
1131
- # Execute block with validation and logging
1132
- current_dataset = block(current_dataset, **block_kwargs)
1059
+ # Execute block with validation and logging
1060
+ current_dataset = block(current_dataset, **block_kwargs)
1133
1061
 
1134
1062
  block_execution_time = (
1135
1063
  time.perf_counter() - block_start_time
@@ -77,9 +77,13 @@ blocks:
77
77
  - ''
78
78
  - block_type: RenameColumnsBlock
79
79
  block_config:
80
- block_name: rename_to_document_column
80
+ block_name: rename_to_raw_document_column
81
81
  input_cols:
82
82
  document: raw_document
83
+ - block_type: RenameColumnsBlock
84
+ block_config:
85
+ block_name: rename_to_document_column
86
+ input_cols:
83
87
  summary: document
84
88
  - block_type: PromptBuilderBlock
85
89
  block_config: