sdg-hub 0.4.2__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226)
  1. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/integration-test.yml +48 -34
  2. sdg_hub-0.5.1/.github/workflows/packer.yml +33 -0
  3. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/test.yml +0 -13
  4. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/CLAUDE.md +0 -7
  5. {sdg_hub-0.4.2/src/sdg_hub.egg-info → sdg_hub-0.5.1}/PKG-INFO +2 -2
  6. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/transform-blocks.md +2 -2
  7. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/flows/overview.md +348 -1
  8. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +1 -1
  9. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +2 -9
  10. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/pyproject.toml +1 -1
  11. sdg_hub-0.5.1/scripts/packer/centos.pkr.hcl +52 -0
  12. sdg_hub-0.5.1/scripts/packer/setup-centos.sh +80 -0
  13. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/_version.py +3 -3
  14. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/__init__.py +0 -22
  15. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +57 -5
  16. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/text_parser_block.py +57 -5
  17. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/rename_columns.py +19 -0
  18. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/base.py +57 -80
  19. sdg_hub-0.5.1/src/sdg_hub/core/utils/temp_manager.py +57 -0
  20. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
  21. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
  22. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
  23. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
  24. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +5 -1
  25. {sdg_hub-0.4.2 → sdg_hub-0.5.1/src/sdg_hub.egg-info}/PKG-INFO +2 -2
  26. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub.egg-info/SOURCES.txt +4 -21
  27. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub.egg-info/requires.txt +1 -1
  28. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_json_structure_block.py +1 -1
  29. sdg_hub-0.4.2/tests/blocks/utilblocks/test_renameblock.py → sdg_hub-0.5.1/tests/blocks/transform/test_rename_columns.py +19 -19
  30. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_uniform_col_val_setter.py +1 -1
  31. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +73 -3
  32. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tox.ini +2 -2
  33. sdg_hub-0.4.2/.github/workflows/e2e.yml +0 -103
  34. sdg_hub-0.4.2/.github/workflows/packer.yml +0 -15
  35. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
  36. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
  37. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
  38. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
  39. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
  40. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
  41. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
  42. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
  43. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
  44. sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
  45. sdg_hub-0.4.2/src/sdg_hub/core/flow/migration.py +0 -198
  46. sdg_hub-0.4.2/tests/blocks/deprecated/test_llmblock.py +0 -148
  47. sdg_hub-0.4.2/tests/blocks/utilblocks/test_combinecolumns.py +0 -168
  48. sdg_hub-0.4.2/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -112
  49. sdg_hub-0.4.2/tests/blocks/utilblocks/test_flattenblock.py +0 -217
  50. sdg_hub-0.4.2/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -37
  51. sdg_hub-0.4.2/tests/blocks/utilblocks/test_selectorblock.py +0 -144
  52. sdg_hub-0.4.2/tests/blocks/utilblocks/test_settomajority.py +0 -127
  53. sdg_hub-0.4.2/tests/flow/test_migration.py +0 -449
  54. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/actionlint.yaml +0 -0
  55. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/actions/free-disk-space/action.yml +0 -0
  56. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/dependabot.yml +0 -0
  57. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/mergify.yml +0 -0
  58. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/actionlint.dockerfile +0 -0
  59. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/actionlint.yml +0 -0
  60. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/docs.yml +0 -0
  61. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/lint.yml +0 -0
  62. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/matchers/actionlint.json +0 -0
  63. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/matchers/pylint.json +0 -0
  64. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/pypi.yaml +0 -0
  65. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.gitignore +0 -0
  66. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.isort.cfg +0 -0
  67. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.markdownlint-cli2.yaml +0 -0
  68. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.pre-commit-config.yaml +0 -0
  69. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.pylintrc +0 -0
  70. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/CONTRIBUTING.md +0 -0
  71. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/LICENSE +0 -0
  72. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/Makefile +0 -0
  73. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/README.md +0 -0
  74. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/.nojekyll +0 -0
  75. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/README.md +0 -0
  76. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/_coverpage.md +0 -0
  77. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/_navbar.md +0 -0
  78. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/_sidebar.md +0 -0
  79. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/api-reference.md +0 -0
  80. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/custom-blocks.md +0 -0
  81. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/filtering-blocks.md +0 -0
  82. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/llm-blocks.md +0 -0
  83. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/overview.md +0 -0
  84. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/concepts.md +0 -0
  85. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/development.md +0 -0
  86. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/flows/discovery.md +0 -0
  87. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/index.html +0 -0
  88. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/installation.md +0 -0
  89. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/quick-start.md +0 -0
  90. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/annotation_classification.ipynb +0 -0
  91. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
  92. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/news_classification_flow.yaml +0 -0
  93. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/news_classification_prompt.yaml +0 -0
  94. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
  95. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
  96. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  97. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +0 -0
  98. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +0 -0
  99. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
  100. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
  101. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/README.md +0 -0
  102. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
  103. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
  104. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  105. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
  106. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  107. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  108. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  109. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  110. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
  111. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
  112. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
  113. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/knowledge_utils.py +0 -0
  114. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/text_analysis/README.md +0 -0
  115. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
  116. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
  117. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/scripts/ruff.sh +0 -0
  118. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/setup.cfg +0 -0
  119. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/__init__.py +0 -0
  120. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/__init__.py +0 -0
  121. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/base.py +0 -0
  122. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
  123. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
  124. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
  125. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
  126. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +0 -0
  127. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -0
  128. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
  129. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/registry.py +0 -0
  130. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
  131. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
  132. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
  133. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
  134. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
  135. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
  136. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
  137. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/__init__.py +0 -0
  138. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/checkpointer.py +0 -0
  139. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/metadata.py +0 -0
  140. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/registry.py +0 -0
  141. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/validation.py +0 -0
  142. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/__init__.py +0 -0
  143. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/datautils.py +0 -0
  144. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/error_handling.py +0 -0
  145. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
  146. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
  147. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/flow_metrics.py +0 -0
  148. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/logger_config.py +0 -0
  149. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/path_resolution.py +0 -0
  150. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/time_estimator.py +0 -0
  151. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
  152. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
  153. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
  154. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
  155. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
  156. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -0
  157. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  158. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
  159. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
  160. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
  161. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
  162. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  163. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
  164. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
  165. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
  166. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
  167. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
  168. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
  169. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
  170. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
  171. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
  172. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
  173. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
  174. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
  175. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
  176. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
  177. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
  178. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
  179. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
  180. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
  181. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
  182. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
  183. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
  184. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -0
  185. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
  186. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/py.typed +0 -0
  187. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  188. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub.egg-info/top_level.txt +0 -0
  189. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/__init__.py +0 -0
  190. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
  191. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_llm_chat_block.py +0 -0
  192. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +0 -0
  193. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_llm_parser_block.py +0 -0
  194. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
  195. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_textparserblock.py +0 -0
  196. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/test_base_block.py +0 -0
  197. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/test_registry.py +0 -0
  198. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_config.yaml +0 -0
  199. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
  200. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
  201. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
  202. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
  203. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
  204. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_index_based_mapper.py +0 -0
  205. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_melt_columns.py +0 -0
  206. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_text_concat.py +0 -0
  207. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/__init__.py +0 -0
  208. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/conftest.py +0 -0
  209. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_base.py +0 -0
  210. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_checkpointer.py +0 -0
  211. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_dataset_requirements.py +0 -0
  212. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_integration.py +0 -0
  213. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_metadata.py +0 -0
  214. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_registry.py +0 -0
  215. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_time_estimation.py +0 -0
  216. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_validation.py +0 -0
  217. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/README.md +0 -0
  218. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/__init__.py +0 -0
  219. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  220. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
  221. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
  222. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +0 -0
  223. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/utils/test_datautils.py +0 -0
  224. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/utils/test_error_handling.py +0 -0
  225. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/utils/test_flow_metrics.py +0 -0
  226. {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/utils/test_path_resolution.py +0 -0
@@ -7,29 +7,11 @@ on:
7
7
  branches:
8
8
  - "main"
9
9
  - "release-**"
10
- paths:
11
- # Only trigger on changes to relevant flows and examples (EXTEND THIS):
12
- - 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
13
- - 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
14
- # Standard integration test triggers, DONT CHANGE THIS
15
- - 'tests/integration/**/*.py'
16
- - 'pyproject.toml'
17
- - 'tox.ini'
18
- - '.github/workflows/integration-test.yml'
19
10
  pull_request:
20
11
  branches:
21
12
  - "main"
22
13
  - "release-**"
23
14
  types: [opened, synchronize, reopened, labeled]
24
- paths:
25
- # Only trigger on changes to relevant flows and examples (EXTEND THIS):
26
- - 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
27
- - 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
28
- # Standard integration test triggers, DONT CHANGE THIS
29
- - 'tests/integration/**/*.py'
30
- - 'pyproject.toml'
31
- - 'tox.ini'
32
- - '.github/workflows/integration-test.yml'
33
15
 
34
16
  env:
35
17
  LC_ALL: en_US.UTF-8
@@ -42,19 +24,58 @@ permissions:
42
24
  contents: read
43
25
 
44
26
  jobs:
27
+ check-trigger:
28
+ name: "Check If Integration Should Run"
29
+ runs-on: ubuntu-latest
30
+ outputs:
31
+ should_run: ${{ steps.check.outputs.should_run }}
32
+ steps:
33
+ - uses: actions/checkout@v4
34
+
35
+ - uses: dorny/paths-filter@v3
36
+ id: filter
37
+ if: github.event_name == 'pull_request'
38
+ with:
39
+ filters: |
40
+ relevant:
41
+ # Only trigger on changes to relevant flows and examples (EXTEND THIS):
42
+ - 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
43
+ - 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
44
+ # Standard integration test triggers, DONT CHANGE THIS
45
+ - 'tests/integration/**/*.py'
46
+ - 'pyproject.toml'
47
+ - 'tox.ini'
48
+ - '.github/workflows/integration-test.yml'
49
+
50
+ - name: Determine if tests should run
51
+ id: check
52
+ run: |
53
+ if [[ "${{ github.event_name }}" == "workflow_dispatch" ]] || [[ "${{ github.event_name }}" == "push" ]]; then
54
+ echo "should_run=true" >> "$GITHUB_OUTPUT"
55
+ elif [[ "${{ github.event_name }}" == "pull_request" ]]; then
56
+ # Check if from fork
57
+ if [[ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]]; then
58
+ echo "should_run=false" >> "$GITHUB_OUTPUT"
59
+ # Check if labeled event with correct label
60
+ elif [[ "${{ github.event.action }}" == "labeled" ]] && [[ "${{ contains(github.event.pull_request.labels.*.name, 'run-integration-tests') }}" == "true" ]]; then
61
+ echo "should_run=true" >> "$GITHUB_OUTPUT"
62
+ # Check if relevant paths changed for non-labeled events
63
+ elif [[ "${{ github.event.action }}" != "labeled" ]] && [[ "${{ steps.filter.outputs.relevant }}" == "true" ]]; then
64
+ echo "should_run=true" >> "$GITHUB_OUTPUT"
65
+ else
66
+ echo "should_run=false" >> "$GITHUB_OUTPUT"
67
+ fi
68
+ else
69
+ echo "should_run=false" >> "$GITHUB_OUTPUT"
70
+ fi
71
+
45
72
  integration-test:
46
73
  name: "Integration Tests - ${{ matrix.python }} on ${{ matrix.platform }}"
47
74
  runs-on: "${{ matrix.platform }}"
75
+ needs: check-trigger
76
+ if: needs.check-trigger.outputs.should_run == 'true'
48
77
  # Require manual approval before running (via GitHub Environment)
49
78
  environment: integration-tests
50
- # Skip fork PRs (they can't access environment secrets anyway)
51
- # Also check for 'run-integration-tests' label on labeled events
52
- if: |
53
- github.event_name == 'workflow_dispatch' ||
54
- github.event_name == 'push' ||
55
- (github.event_name == 'pull_request' &&
56
- github.event.pull_request.head.repo.full_name == github.repository &&
57
- (github.event.action != 'labeled' || contains(github.event.pull_request.labels.*.name, 'run-integration-tests')))
58
79
  strategy:
59
80
  matrix:
60
81
  python:
@@ -89,12 +110,9 @@ jobs:
89
110
  **/pyproject.toml
90
111
  **/requirements*.txt
91
112
 
92
- - name: Remove llama-cpp-python from cache
93
- run: |
94
- pip cache remove llama_cpp_python
95
113
 
96
114
  - name: Cache huggingface datasets
97
- uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
115
+ uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
98
116
  with:
99
117
  path: ~/.cache/huggingface
100
118
  # Invalidate cache when any example notebook changes (may affect dataset downloads)
@@ -111,10 +129,6 @@ jobs:
111
129
  run: |
112
130
  tox -e py3-integrationcov
113
131
 
114
- - name: Remove llama-cpp-python from cache
115
- if: always()
116
- run: |
117
- pip cache remove llama_cpp_python
118
132
 
119
133
  - name: Upload integration test coverage to Codecov
120
134
  uses: codecov/codecov-action@v4
@@ -0,0 +1,33 @@
1
+ name: Build AMI with Packer
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ jobs:
7
+ build-ami:
8
+ runs-on: ubuntu-latest
9
+ permissions:
10
+ id-token: write # This is required for OIDC
11
+ contents: read
12
+
13
+ steps:
14
+ - name: Checkout repository
15
+ uses: actions/checkout@v4
16
+
17
+ - name: Configure AWS Credentials
18
+ uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c
19
+ with:
20
+ role-to-assume: arn:aws:iam::851725220677:role/github-actions-packer-role
21
+ aws-region: us-east-2
22
+ role-session-name: github-actions-packer # For tracking in CloudTrail
23
+
24
+ - name: Setup Packer
25
+ uses: hashicorp/setup-packer@1aa358be5cf73883762b302a3a03abd66e75b232
26
+
27
+ - name: Build and create AMI
28
+ run: |
29
+ set -euo pipefail
30
+ cd scripts/packer
31
+ packer init .
32
+ packer validate .
33
+ packer build .
@@ -86,16 +86,7 @@ jobs:
86
86
  **/pyproject.toml
87
87
  **/requirements*.txt
88
88
 
89
- - name: Remove llama-cpp-python from cache
90
- run: |
91
- pip cache remove llama_cpp_python
92
89
 
93
- - name: Cache huggingface
94
- uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
95
- with:
96
- path: ~/.cache/huggingface
97
- # config contains DEFAULT_MODEL
98
- key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }}
99
90
 
100
91
  - name: Install dependencies
101
92
  run: |
@@ -107,10 +98,6 @@ jobs:
107
98
  tox -e py3-unitcov
108
99
 
109
100
 
110
- - name: Remove llama-cpp-python from cache
111
- if: always()
112
- run: |
113
- pip cache remove llama_cpp_python
114
101
 
115
102
  - name: Upload coverage to Codecov
116
103
  uses: codecov/codecov-action@v4
@@ -86,7 +86,6 @@ The framework is built around a modular block system with **composability at its
86
86
  - `transform/`: Data transformation blocks (column operations, text manipulation)
87
87
  - `filtering/`: Data filtering blocks with quality thresholds
88
88
  - `evaluation/`: Quality evaluation blocks (faithfulness, relevancy assessment)
89
- - `deprecated_blocks/`: Legacy blocks maintained for backward compatibility
90
89
 
91
90
  **Key Benefits**: Type-safe composition, automatic validation, rich logging, and high-performance async processing.
92
91
 
@@ -97,7 +96,6 @@ Flows orchestrate multiple blocks into data processing pipelines:
97
96
  - **FlowRegistry** (`src/sdg_hub/core/flow/registry.py`): Registry for flow discovery
98
97
  - **FlowMetadata** (`src/sdg_hub/core/flow/metadata.py`): Metadata and parameter definitions
99
98
  - **FlowValidator** (`src/sdg_hub/core/flow/validation.py`): YAML structure validation
100
- - **FlowMigration** (`src/sdg_hub/core/flow/migration.py`): Backward compatibility for old flow formats
101
99
 
102
100
  ### Flow Configuration
103
101
  Flows are defined in YAML files with this structure:
@@ -148,11 +146,6 @@ All blocks operate on HuggingFace `datasets.Dataset` objects:
148
146
  - Rich logging provides processing summaries
149
147
  - Empty dataset handling with appropriate errors
150
148
 
151
- ### Backward Compatibility
152
- The framework maintains compatibility with legacy formats:
153
- - Deprecated blocks are preserved in `deprecated_blocks/`
154
- - Flow migration automatically converts old YAML formats
155
- - Legacy LLMBlocks receive special handling during execution
156
149
 
157
150
  ## Testing Guidelines
158
151
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.4.2
3
+ Version: 0.5.1
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: click<9.0.0,>=8.1.7
26
- Requires-Dist: datasets<4.0.0,>=2.18.0
26
+ Requires-Dist: datasets>=4.0.0
27
27
  Requires-Dist: httpx<1.0.0,>=0.25.0
28
28
  Requires-Dist: jinja2
29
29
  Requires-Dist: litellm<1.75.0,>=1.73.0
@@ -19,8 +19,8 @@ Maps values based on their position/index, useful for applying transformations b
19
19
  ### MeltColumnsBlock
20
20
  Reshapes data from wide format to long format, converting multiple columns into key-value pairs.
21
21
 
22
- ### UniformColValSetterBlock
23
- Sets uniform values across specified columns, useful for adding metadata or default values.
22
+ ### UniformColumnValueSetter
23
+ Replaces all values in a column with a single statistical aggregate (mode, min, max, mean, or median) computed from the data. Modifies the column in-place, useful for data normalization, creating baseline comparisons, or extracting dominant values.
24
24
 
25
25
 
26
26
  ## 🚀 Next Steps
@@ -116,7 +116,139 @@ metadata:
116
116
  max_samples: 10000
117
117
  ```
118
118
 
119
- #TODO: Add metadata fields information
119
+ ### Metadata Fields Reference
120
+
121
+ The metadata section supports the following fields for flow configuration:
122
+
123
+ #### Core Metadata Fields
124
+
125
+ | Field | Type | Required | Default | Description |
126
+ |-------|------|----------|---------|-------------|
127
+ | `name` | `string` | Yes | - | Human-readable name of the flow. Must be at least 1 character. |
128
+ | `id` | `string` | No | Auto-generated | Unique identifier for the flow. Auto-generated from name if not provided. Must be lowercase, contain only alphanumeric characters and hyphens, and must not start or end with a hyphen. |
129
+ | `description` | `string` | No | `""` | Detailed description of what the flow does and its purpose. |
130
+ | `version` | `string` | No | `"1.0.0"` | Semantic version in `MAJOR.MINOR.PATCH` format, optionally followed by a pre-release suffix (e.g., "1.0.0", "2.1.3-beta"). |
131
+ | `author` | `string` | No | `""` | Name of the flow author or contributor. |
132
+ | `license` | `string` | No | `"Apache-2.0"` | License identifier for the flow (e.g., "Apache-2.0", "MIT", "GPL-3.0"). |
133
+ | `tags` | `List[string]` | No | `[]` | List of tags for categorization and discovery. Tags are automatically converted to lowercase. |
134
+ | `recommended_models` | `RecommendedModels` | No | `None` | Recommended LLM models for optimal flow performance. See below for structure. |
135
+ | `dataset_requirements` | `DatasetRequirements` | No | `None` | Input dataset requirements and validation rules. See below for structure. |
136
+
137
+ #### RecommendedModels Structure
138
+
139
+ The `recommended_models` field helps users choose appropriate LLM models for the flow:
140
+
141
+ ```yaml
142
+ recommended_models:
143
+ default: "meta-llama/Llama-3.3-70B-Instruct"
144
+ compatible:
145
+ - "microsoft/phi-4"
146
+ - "mistralai/Mixtral-8x7B-Instruct-v0.1"
147
+ experimental:
148
+ - "google/gemini-pro"
149
+ ```
150
+
151
+ | Field | Type | Required | Default | Description |
152
+ |-------|------|----------|---------|-------------|
153
+ | `default` | `string` | Yes | - | The default model recommended for this flow. This is the primary model users should use. |
154
+ | `compatible` | `List[string]` | No | `[]` | List of models known to work well with this flow. Alternative options with good performance. |
155
+ | `experimental` | `List[string]` | No | `[]` | List of experimental models that may work but haven't been extensively tested with this flow. |
156
+
157
+ **Model Selection Behavior:**
158
+
159
+ When the framework needs to select a model, it prioritizes in this order:
160
+ 1. `default` model if available
161
+ 2. First available model from `compatible` list
162
+ 3. First available model from `experimental` list
163
+
164
+ #### DatasetRequirements Structure
165
+
166
+ The `dataset_requirements` field validates input datasets and documents expected data format:
167
+
168
+ ```yaml
169
+ dataset_requirements:
170
+ required_columns:
171
+ - "document"
172
+ - "context"
173
+ optional_columns:
174
+ - "metadata"
175
+ - "source"
176
+ min_samples: 1
177
+ max_samples: 10000
178
+ column_types:
179
+ document: "string"
180
+ context: "string"
181
+ description: "Documents with context for Q&A generation"
182
+ ```
183
+
184
+ | Field | Type | Required | Default | Description |
185
+ |-------|------|----------|---------|-------------|
186
+ | `required_columns` | `List[string]` | No | `[]` | Column names that must be present in the input dataset. Flow validation will fail if these are missing. |
187
+ | `optional_columns` | `List[string]` | No | `[]` | Column names that are optional but can enhance flow performance if provided. |
188
+ | `min_samples` | `integer` | No | `1` | Minimum number of samples required in the input dataset. Must be at least 1. |
189
+ | `max_samples` | `integer` | No | `None` | Maximum number of samples to process. Useful for resource management and preventing excessive processing. |
190
+ | `column_types` | `Dict[string, string]` | No | `{}` | Expected data types for specific columns (e.g., "string", "integer", "float"). Used for documentation purposes. |
191
+ | `description` | `string` | No | `""` | Human-readable description of the dataset requirements and expected format. |
192
+
193
+ **Validation Behavior:**
194
+
195
+ - The flow will validate the input dataset against `required_columns` before execution
196
+ - Missing required columns will cause the flow to fail with a clear error message
197
+ - Sample count validation ensures the dataset meets `min_samples` and respects `max_samples` if set
198
+ - `max_samples` must be greater than or equal to `min_samples` if both are specified
199
+
200
+ #### Complete Metadata Example
201
+
202
+ Here's a comprehensive example using all available metadata fields:
203
+
204
+ ```yaml
205
+ metadata:
206
+ name: "Advanced Document Q&A Generation"
207
+ id: "advanced-document-qa-generation"
208
+ description: |
209
+ A sophisticated flow that processes documents to generate high-quality
210
+ question-answer pairs with faithfulness evaluation and quality filtering.
211
+ Designed for educational content and training data generation.
212
+ version: "2.1.0"
213
+ author: "SDG Hub Team"
214
+ license: "Apache-2.0"
215
+
216
+ recommended_models:
217
+ default: "meta-llama/Llama-3.3-70B-Instruct"
218
+ compatible:
219
+ - "microsoft/phi-4"
220
+ - "mistralai/Mixtral-8x7B-Instruct-v0.1"
221
+ - "meta-llama/Llama-3.1-70B-Instruct"
222
+ experimental:
223
+ - "google/gemini-pro"
224
+ - "anthropic/claude-3-opus"
225
+
226
+ tags:
227
+ - "question-generation"
228
+ - "document-processing"
229
+ - "educational"
230
+ - "qa-pairs"
231
+
232
+ dataset_requirements:
233
+ required_columns:
234
+ - "document"
235
+ - "context"
236
+ optional_columns:
237
+ - "domain"
238
+ - "difficulty_level"
239
+ - "source_url"
240
+ min_samples: 10
241
+ max_samples: 5000
242
+ column_types:
243
+ document: "string"
244
+ context: "string"
245
+ domain: "string"
246
+ difficulty_level: "integer"
247
+ description: |
248
+ Input dataset should contain documents with contextual information.
249
+ Each document should be well-formed text suitable for Q&A generation.
250
+ Optional domain and difficulty_level fields help tailor generation.
251
+ ```
120
252
 
121
253
  ### Blocks Section
122
254
 
@@ -572,6 +704,221 @@ Checkpoint directories contain:
572
704
  - If all samples are completed, Flow skips processing and returns merged results immediately
573
705
  - Clean up checkpoint directories manually when no longer needed
574
706
 
707
+ ## 📊 Flow Metrics and Reporting
708
+
709
+ SDG Hub automatically tracks and reports detailed execution metrics for every flow run, providing visibility into performance, data transformations, and success/failure status. This built-in monitoring system helps you understand bottlenecks, debug issues, and optimize your pipelines.
710
+
711
+ ### Automatic Metrics Collection
712
+
713
+ The flow execution system automatically collects comprehensive metrics for each block without any configuration required:
714
+
715
+ **Collected Metrics:**
716
+ - **Block Identification** - Block name and type for clear tracking
717
+ - **Execution Time** - Precise timing for each block's execution
718
+ - **Row Changes** - Input and output row counts to track data filtering
719
+ - **Column Changes** - Added and removed columns to understand data transformations
720
+ - **Status** - Success or failure status for each block
721
+ - **Error Details** - Full error messages and types when blocks fail
722
+
723
+ ### Rich Console Output
724
+
725
+ After every flow execution (whether successful or failed), a beautifully formatted summary table is automatically displayed in your terminal using the Rich library:
726
+
727
+ ```python
728
+ from sdg_hub.core.flow import Flow
729
+ from datasets import Dataset
730
+
731
+ # Load and configure flow
732
+ flow = Flow.from_yaml("path/to/flow.yaml")
733
+ flow.set_model_config(
734
+ model="hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
735
+ api_base="http://localhost:8000/v1"
736
+ )
737
+
738
+ # Execute flow - metrics displayed automatically at completion
739
+ result = flow.generate(dataset)
740
+ ```
741
+
742
+ **Example Console Output:**
743
+
744
+ ```
745
+ ┌─────────────────── Advanced Document Q&A Generation - Complete ───────────────────┐
746
+ │ Flow Execution Summary │
747
+ │ ┌──────────────────────┬─────────────────┬──────────┬──────────────┬─────────┬──┐│
748
+ │ │ Block Name │ Type │ Duration │ Rows │ Columns │ ││
749
+ │ ├──────────────────────┼─────────────────┼──────────┼──────────────┼─────────┼──┤│
750
+ │ │ backup_document │ DuplicateCol... │ 0.05s │ 100 → 100 │ +1 │ ✓││
751
+ │ │ build_question_... │ PromptBuilder...│ 0.12s │ 100 → 100 │ +1 │ ✓││
752
+ │ │ generate_question │ LLMChatBlock │ 45.30s │ 100 → 100 │ +1 │ ✓││
753
+ │ │ generate_answer │ LLMChatBlock │ 78.45s │ 100 → 100 │ +1 │ ✓││
754
+ │ │ eval_faithfulness... │ LLMChatBlock │ 52.20s │ 100 → 100 │ +1 │ ✓││
755
+ │ │ extract_eval_con... │ LLMParserBlock │ 0.15s │ 100 → 100 │ +2 │ ✓││
756
+ │ │ parse_evaluation │ TextParserBlock │ 0.22s │ 100 → 100 │ +2 │ ✓││
757
+ │ │ filter_faithful │ ColumnValueF... │ 0.08s │ 100 → 87 │ — │ ✓││
758
+ │ ├──────────────────────┼─────────────────┼──────────┼──────────────┼─────────┼──┤│
759
+ │ │ TOTAL │ 8 blocks │ 176.57s │ 87 final │ 9 final │ ✓││
760
+ │ └──────────────────────┴─────────────────┴──────────┴──────────────┴─────────┴──┘│
761
+ └─────────────────────────────────────────────────────────────────────────────────────┘
762
+ ```
763
+
764
+ **Table Columns Explained:**
765
+
766
+ | Column | Description |
767
+ |--------|-------------|
768
+ | **Block Name** | The unique name of the block as defined in the flow YAML |
769
+ | **Type** | The block class name (e.g., LLMChatBlock, PromptBuilderBlock) |
770
+ | **Duration** | Execution time in seconds for that specific block |
771
+ | **Rows** | Row transformation showing `input_count → output_count` |
772
+ | **Columns** | Column changes: `+N` for added, `-N` for removed, `+N/-M` for both, `—` for no change |
773
+ | **Status** | `✓` for success, `✗` for failure |
774
+
775
+ **Status Indicators:**
776
+
777
+ The panel border color and title reflect the overall execution status:
778
+
779
+ - **Green border + "Complete"** - All blocks executed successfully
780
+ - **Red border + "Failed"** - Flow execution failed (exception thrown)
781
+ - **Yellow border + "Partial"** - Some blocks completed but others failed
782
+
783
+ ### JSON Metrics Export
784
+
785
+ For production workflows, detailed metrics can be automatically saved to JSON files for analysis, monitoring, and debugging:
786
+
787
+ ```python
788
+ # Enable JSON metrics export by providing a log directory
789
+ result = flow.generate(
790
+ dataset,
791
+ log_dir="./flow_logs"
792
+ )
793
+
794
+ # Metrics automatically saved to: ./flow_logs/{flow_name}_{timestamp}_metrics.json
795
+ ```
796
+
797
+ **JSON Structure:**
798
+
799
+ ```json
800
+ {
801
+ "flow_name": "Advanced Document Q&A Generation",
802
+ "flow_version": "2.1.0",
803
+ "execution_timestamp": "20250113_143052",
804
+ "execution_successful": true,
805
+ "total_execution_time": 176.57,
806
+ "total_wall_time": 178.23,
807
+ "total_blocks": 8,
808
+ "successful_blocks": 8,
809
+ "failed_blocks": 0,
810
+ "block_metrics": [
811
+ {
812
+ "block_name": "backup_document",
813
+ "block_type": "DuplicateColumnsBlock",
814
+ "execution_time": 0.05,
815
+ "input_rows": 100,
816
+ "output_rows": 100,
817
+ "added_cols": ["original_document"],
818
+ "removed_cols": [],
819
+ "status": "success"
820
+ },
821
+ {
822
+ "block_name": "generate_question",
823
+ "block_type": "LLMChatBlock",
824
+ "execution_time": 45.30,
825
+ "input_rows": 100,
826
+ "output_rows": 100,
827
+ "added_cols": ["question"],
828
+ "removed_cols": [],
829
+ "status": "success"
830
+ }
831
+ ]
832
+ }
833
+ ```
834
+
835
+ **JSON Fields Reference:**
836
+
837
+ | Field | Type | Description |
838
+ |-------|------|-------------|
839
+ | `flow_name` | string | Human-readable flow name from metadata |
840
+ | `flow_version` | string | Flow version string |
841
+ | `execution_timestamp` | string | Timestamp when execution started (YYYYMMDD_HHMMSS format) |
842
+ | `execution_successful` | boolean | `true` if all blocks succeeded, `false` if any failed |
843
+ | `total_execution_time` | float | Sum of all block execution times in seconds |
844
+ | `total_wall_time` | float | End-to-end wall clock time including overhead |
845
+ | `total_blocks` | integer | Number of blocks in the flow |
846
+ | `successful_blocks` | integer | Count of blocks that executed successfully |
847
+ | `failed_blocks` | integer | Count of blocks that failed |
848
+ | `block_metrics` | array | Detailed metrics for each block (see below) |
849
+
850
+ **Block Metrics Fields:**
851
+
852
+ | Field | Type | Description |
853
+ |-------|------|-------------|
854
+ | `block_name` | string | Unique block identifier |
855
+ | `block_type` | string | Block class name |
856
+ | `execution_time` | float | Block execution duration in seconds |
857
+ | `input_rows` | integer | Number of rows received by the block |
858
+ | `output_rows` | integer | Number of rows produced by the block |
859
+ | `added_cols` | array | List of column names added by this block |
860
+ | `removed_cols` | array | List of column names removed by this block |
861
+ | `status` | string | `"success"` or `"failed"` |
862
+ | `error` | string | Error message (only present if `status` is `"failed"`) |
863
+ | `error_type` | string | Error class name (only present if `status` is `"failed"`) |
864
+
865
+ ### Metrics Aggregation
866
+
867
+ When using checkpointing with `save_freq`, blocks may execute multiple times on different chunks of data. The metrics system automatically aggregates these executions per block:
868
+
869
+ - **Execution times** are summed across all chunks
870
+ - **Row counts** are totaled for input and output
871
+ - **Column changes** are merged (duplicates removed)
872
+ - **Status** reflects the worst case (any failure marks the block as failed)
873
+
874
+ This ensures the metrics summary and JSON export always show a cohesive view of the entire flow execution.
875
+
876
+ ### Use Cases
877
+
878
+ **Performance Optimization:**
879
+ ```python
880
+ # Identify slow blocks for optimization
881
+ result = flow.generate(dataset, log_dir="./optimization_analysis")
882
+ # Review metrics JSON to find blocks with high execution_time
883
+ ```
884
+
885
+ **Data Quality Monitoring:**
886
+ ```python
887
+ # Track how filtering affects dataset size
888
+ result = flow.generate(dataset)
889
+ # Check console output for row count changes: "100 → 87" indicates 13 filtered
890
+ ```
891
+
892
+ **Production Monitoring:**
893
+ ```python
894
+ # Continuous metrics collection for production pipelines
895
+ for batch in data_batches:
896
+ result = flow.generate(
897
+ batch,
898
+ log_dir=f"./production_metrics/{date}",
899
+ checkpoint_dir=f"./checkpoints/{batch_id}"
900
+ )
901
+ # Aggregate metrics JSON files for dashboards and alerting
902
+ ```
903
+
904
+ **Debugging Failed Runs:**
905
+ ```python
906
+ # Automatic error capture in metrics
907
+ try:
908
+ result = flow.generate(dataset, log_dir="./debug_logs")
909
+ except Exception as e:
910
+ # Metrics JSON contains full error details for failed blocks
911
+ print(f"Check ./debug_logs for detailed failure metrics")
912
+ ```
913
+
914
+ ### Important Notes
915
+
916
+ - **Always Displayed** - Metrics are shown even if the flow fails, helping debug issues
917
+ - **Zero Configuration** - No setup required, metrics collection is automatic
918
+ - **Minimal Overhead** - Metrics collection adds negligible performance impact
919
+ - **Thread-Safe** - Metrics are properly collected during concurrent block execution
920
+ - **Checkpoint Aware** - Metrics correctly aggregate across checkpointed chunks
921
+
575
922
  ## 🚀 Next Steps
576
923
 
577
924
  Ready to master the flow system? Explore these detailed guides:
@@ -359,7 +359,7 @@
359
359
  "processed_knowledge_dataset = processed_knowledge_dataset.remove_columns(['messages']).rename_column('messages_without_think', 'messages')\n",
360
360
  "\n",
361
361
  "cfg = RAFTConfig(k_passages=5, max_tokens_per_chunk=400, p_include_oracle=0.9)\n",
362
- "raft_samples = build_raft_samples(ds, cfg)\n",
362
+ "raft_samples = build_raft_samples(processed_knowledge_dataset, cfg)\n",
363
363
  "raft_samples = raft_samples.map(build_messages).remove_columns(['question', 'context', 'oracle_context', 'cot_answer', 'answer', 'instruction', 'type', 'meta'])\n",
364
364
  "\n",
365
365
  "fp = \"<Instruction/Skills dataset>\" # TODO: Replace with huggingface dataset path once its uploaded\n",
@@ -237,20 +237,13 @@ def build_messages(raft_record: Dict[str, Any]):
237
237
  Output:
238
238
  messages: list of {"role": "system"|"user"|"assistant", "content": str}
239
239
  """
240
- # 1. System message
241
- sys_msg = raft_record.get("instruction") or (
242
- "You are a domain expert. You must answer questions by first quoting a span "
243
- "verbatim from the relevant passage, then giving reasoning, then the final answer. "
244
- "Ignore distractor passages."
245
- )
246
-
247
- # 2. User message: serialize passages + question
240
+ # 1. User message: serialize passages + question
248
241
  passages = "\n\n".join(
249
242
  [f"[Passage {i+1}] {p}" for i, p in enumerate(raft_record["context"])]
250
243
  )
251
244
  user_msg = f"Passages:\n{passages}\n\nQuestion: {raft_record['question']}"
252
245
 
253
- # 3. Assistant message: the gold output
246
+ # 2. Assistant message: the gold output
254
247
  assistant_msg = raft_record["answer"]
255
248
 
256
249
  return {"messages" : [
@@ -30,7 +30,7 @@ classifiers = [
30
30
  # Core dependencies moved from requirements.txt
31
31
  dependencies = [
32
32
  "click>=8.1.7,<9.0.0",
33
- "datasets>=2.18.0,<4.0.0",
33
+ "datasets>=4.0.0",
34
34
  "httpx>=0.25.0,<1.0.0",
35
35
  "jinja2",
36
36
  "litellm>=1.73.0,<1.75.0",