sdg-hub 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/integration-test.yml +4 -1
  2. {sdg_hub-0.4.1/src/sdg_hub.egg-info → sdg_hub-0.4.2}/PKG-INFO +1 -1
  3. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/concepts.md +14 -1
  4. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/flows/discovery.md +38 -1
  5. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/flows/overview.md +35 -0
  6. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/quick-start.md +6 -3
  7. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +4 -1
  8. sdg_hub-0.4.2/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +214 -0
  9. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +72 -43
  10. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +57 -237
  11. sdg_hub-0.4.2/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +259 -0
  12. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +2 -2
  13. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/_version.py +3 -3
  14. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/base.py +139 -2
  15. sdg_hub-0.4.2/src/sdg_hub/core/utils/__init__.py +21 -0
  16. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/flow_metrics.py +116 -0
  17. sdg_hub-0.4.2/src/sdg_hub/core/utils/time_estimator.py +344 -0
  18. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +11 -9
  19. {sdg_hub-0.4.1 → sdg_hub-0.4.2/src/sdg_hub.egg-info}/PKG-INFO +1 -1
  20. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub.egg-info/SOURCES.txt +6 -0
  21. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/test_base.py +75 -2
  22. sdg_hub-0.4.2/tests/flow/test_time_estimation.py +546 -0
  23. sdg_hub-0.4.2/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +1 -0
  24. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +7 -1
  25. sdg_hub-0.4.2/tests/utils/test_flow_metrics.py +477 -0
  26. sdg_hub-0.4.1/src/sdg_hub/core/utils/__init__.py +0 -13
  27. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/actionlint.yaml +0 -0
  28. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/actions/free-disk-space/action.yml +0 -0
  29. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/dependabot.yml +0 -0
  30. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/mergify.yml +0 -0
  31. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/actionlint.dockerfile +0 -0
  32. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/actionlint.yml +0 -0
  33. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/docs.yml +0 -0
  34. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/e2e.yml +0 -0
  35. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/lint.yml +0 -0
  36. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/matchers/actionlint.json +0 -0
  37. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/matchers/pylint.json +0 -0
  38. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/packer.yml +0 -0
  39. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/pypi.yaml +0 -0
  40. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.github/workflows/test.yml +0 -0
  41. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.gitignore +0 -0
  42. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.isort.cfg +0 -0
  43. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.markdownlint-cli2.yaml +0 -0
  44. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.pre-commit-config.yaml +0 -0
  45. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/.pylintrc +0 -0
  46. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/CLAUDE.md +0 -0
  47. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/CONTRIBUTING.md +0 -0
  48. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/LICENSE +0 -0
  49. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/Makefile +0 -0
  50. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/README.md +0 -0
  51. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/.nojekyll +0 -0
  52. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/README.md +0 -0
  53. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/_coverpage.md +0 -0
  54. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/_navbar.md +0 -0
  55. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/_sidebar.md +0 -0
  56. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/api-reference.md +0 -0
  57. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/blocks/custom-blocks.md +0 -0
  58. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/blocks/filtering-blocks.md +0 -0
  59. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/blocks/llm-blocks.md +0 -0
  60. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/blocks/overview.md +0 -0
  61. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/blocks/transform-blocks.md +0 -0
  62. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/development.md +0 -0
  63. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/index.html +0 -0
  64. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/docs/installation.md +0 -0
  65. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/annotation/annotation_classification.ipynb +0 -0
  66. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
  67. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/annotation/news_classification_flow.yaml +0 -0
  68. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/annotation/news_classification_prompt.yaml +0 -0
  69. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
  70. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  71. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
  72. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
  73. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/README.md +0 -0
  74. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
  75. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
  76. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  77. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
  78. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  79. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  80. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  81. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  82. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
  83. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
  84. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/knowledge_tuning/knowledge_utils.py +0 -0
  85. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/text_analysis/README.md +0 -0
  86. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
  87. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
  88. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/pyproject.toml +0 -0
  89. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/scripts/ruff.sh +0 -0
  90. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/setup.cfg +0 -0
  91. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/__init__.py +0 -0
  92. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/__init__.py +0 -0
  93. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/__init__.py +0 -0
  94. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/base.py +0 -0
  95. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -0
  96. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -0
  97. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -0
  98. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -0
  99. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -0
  100. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -0
  101. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -0
  102. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -0
  103. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -0
  104. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -0
  105. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
  106. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
  107. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
  108. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
  109. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +0 -0
  110. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -0
  111. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +0 -0
  112. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
  113. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/text_parser_block.py +0 -0
  114. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/registry.py +0 -0
  115. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
  116. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
  117. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
  118. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
  119. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
  120. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/rename_columns.py +0 -0
  121. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
  122. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
  123. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/__init__.py +0 -0
  124. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/checkpointer.py +0 -0
  125. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/metadata.py +0 -0
  126. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/migration.py +0 -0
  127. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/registry.py +0 -0
  128. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/validation.py +0 -0
  129. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/datautils.py +0 -0
  130. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/error_handling.py +0 -0
  131. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
  132. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
  133. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/logger_config.py +0 -0
  134. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/path_resolution.py +0 -0
  135. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
  136. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
  137. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
  138. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
  139. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +0 -0
  140. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
  141. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -0
  142. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  143. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
  144. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +0 -0
  145. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
  146. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
  147. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
  148. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  149. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +0 -0
  150. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
  151. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
  152. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
  153. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
  154. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
  155. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
  156. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
  157. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
  158. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
  159. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -0
  160. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
  161. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
  162. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
  163. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
  164. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
  165. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
  166. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
  167. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
  168. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
  169. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
  170. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
  171. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
  172. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -0
  173. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
  174. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub/py.typed +0 -0
  175. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  176. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub.egg-info/requires.txt +0 -0
  177. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/src/sdg_hub.egg-info/top_level.txt +0 -0
  178. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/__init__.py +0 -0
  179. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/deprecated/test_llmblock.py +0 -0
  180. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
  181. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/llm/test_llm_chat_block.py +0 -0
  182. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +0 -0
  183. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/llm/test_llm_parser_block.py +0 -0
  184. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
  185. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/llm/test_textparserblock.py +0 -0
  186. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/test_base_block.py +0 -0
  187. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/test_registry.py +0 -0
  188. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/testdata/test_config.yaml +0 -0
  189. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
  190. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
  191. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
  192. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
  193. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
  194. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/transform/test_index_based_mapper.py +0 -0
  195. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/transform/test_json_structure_block.py +0 -0
  196. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/transform/test_melt_columns.py +0 -0
  197. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/transform/test_text_concat.py +0 -0
  198. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/transform/test_uniform_col_val_setter.py +0 -0
  199. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_combinecolumns.py +0 -0
  200. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -0
  201. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_flattenblock.py +0 -0
  202. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_renameblock.py +0 -0
  203. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -0
  204. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_selectorblock.py +0 -0
  205. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_settomajority.py +0 -0
  206. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/__init__.py +0 -0
  207. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/conftest.py +0 -0
  208. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/test_checkpointer.py +0 -0
  209. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/test_dataset_requirements.py +0 -0
  210. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/test_integration.py +0 -0
  211. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/test_metadata.py +0 -0
  212. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/test_migration.py +0 -0
  213. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/test_registry.py +0 -0
  214. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/flow/test_validation.py +0 -0
  215. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/integration/README.md +0 -0
  216. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/integration/__init__.py +0 -0
  217. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  218. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
  219. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
  220. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/utils/test_datautils.py +0 -0
  221. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/utils/test_error_handling.py +0 -0
  222. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tests/utils/test_path_resolution.py +0 -0
  223. {sdg_hub-0.4.1 → sdg_hub-0.4.2}/tox.ini +0 -0
@@ -20,6 +20,7 @@ on:
20
20
  branches:
21
21
  - "main"
22
22
  - "release-**"
23
+ types: [opened, synchronize, reopened, labeled]
23
24
  paths:
24
25
  # Only trigger on changes to relevant flows and examples (EXTEND THIS):
25
26
  - 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
@@ -47,11 +48,13 @@ jobs:
47
48
  # Require manual approval before running (via GitHub Environment)
48
49
  environment: integration-tests
49
50
  # Skip fork PRs (they can't access environment secrets anyway)
51
+ # Also check for 'run-integration-tests' label on labeled events
50
52
  if: |
51
53
  github.event_name == 'workflow_dispatch' ||
52
54
  github.event_name == 'push' ||
53
55
  (github.event_name == 'pull_request' &&
54
- github.event.pull_request.head.repo.full_name == github.repository)
56
+ github.event.pull_request.head.repo.full_name == github.repository &&
57
+ (github.event.action != 'labeled' || contains(github.event.pull_request.labels.*.name, 'run-integration-tests')))
55
58
  strategy:
56
59
  matrix:
57
60
  python:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -148,9 +148,22 @@ Every block validates data at runtime:
148
148
  ## 🚀 Best Practices
149
149
 
150
150
  ### 1. Start Small
151
- - Use `dry_run()` to test with small samples
151
+ - Use `dry_run()` to test with small samples before processing full datasets
152
+ - Add `enable_time_estimation=True` to predict execution time for the complete dataset
152
153
  - Validate your pipeline before scaling up
153
154
 
155
+ ```python
156
+ # Test AND estimate in one call
157
+ result = flow.dry_run(dataset, sample_size=5, enable_time_estimation=True, max_concurrency=100)
158
+
159
+ # Access dry run results
160
+ print(f"Tested with {result['sample_size']} samples")
161
+ print(f"Output columns: {result['final_dataset']['columns']}")
162
+
163
+ # Time estimation is automatically displayed in a Rich table format
164
+ # No need to access it programmatically - the table shows all estimation details
165
+ ```
166
+
154
167
  ### 2. Layer Validation
155
168
  - Use basic block composition (PromptBuilder → LLMChat → Parser → Filter) to assess quality
156
169
  - Implement filtering to maintain data standards
@@ -67,7 +67,44 @@ for flow_name in all_flows:
67
67
 
68
68
  ### Getting Flow Information
69
69
 
70
- #TODO: Add flow info example
70
+ Access detailed flow metadata and configuration:
71
+
72
+ ```python
73
+ from sdg_hub.core.flow import FlowRegistry, Flow
74
+
75
+ # Get metadata for a specific flow
76
+ flow_name = "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
77
+ metadata = FlowRegistry.get_flow_metadata(flow_name)
78
+
79
+ if metadata:
80
+ print(f"Flow: {metadata.name}")
81
+ print(f"Version: {metadata.version}")
82
+ print(f"Author: {metadata.author}")
83
+ print(f"Description: {metadata.description}")
84
+ print(f"Tags: {', '.join(metadata.tags)}")
85
+ print(f"Recommended model: {metadata.recommended_models.get('default', 'Not specified')}")
86
+
87
+ # Load flow and get detailed information
88
+ flow_path = FlowRegistry.get_flow_path(flow_name)
89
+ flow = Flow.from_yaml(flow_path)
90
+
91
+ # Get comprehensive flow info
92
+ info = flow.get_info()
93
+ print(f"Total blocks: {info['total_blocks']}")
94
+ print(f"Block sequence: {', '.join(info['block_names'])}")
95
+
96
+ # Get dataset requirements
97
+ requirements = flow.get_dataset_requirements()
98
+ if requirements:
99
+ print(f"Required columns: {requirements.required_columns}")
100
+ print(f"Description: {requirements.description}")
101
+ print(f"Min samples: {requirements.min_samples}")
102
+
103
+ # Get model recommendations
104
+ recommendations = flow.get_model_recommendations()
105
+ print(f"Default model: {recommendations.get('default')}")
106
+ print(f"Compatible models: {recommendations.get('compatible', [])}")
107
+ ```
71
108
 
72
109
  ### Getting Flow Paths
73
110
 
@@ -292,6 +292,41 @@ print(f"Output columns: {dry_result['final_dataset']['columns']}")
292
292
  print(f"Sample output: {dry_result['sample_output']}")
293
293
  ```
294
294
 
295
+ ### Time Estimation
296
+
297
+ Predict execution time for your full dataset before running:
298
+
299
+ ```python
300
+ # Get dry run results AND time estimation in one call
301
+ result = flow.dry_run(
302
+ dataset,
303
+ sample_size=5,
304
+ enable_time_estimation=True,
305
+ max_concurrency=100
306
+ )
307
+
308
+ # Time estimation is automatically displayed in a Rich table format
309
+ # The table shows estimated time, total API requests, and per-block breakdowns
310
+ print(f"Dry run completed with {result['sample_size']} samples")
311
+ print(f"Output columns: {result['final_dataset']['columns']}")
312
+ ```
313
+
314
+ **How It Works:**
315
+
316
+ The estimation uses 2 dry runs to accurately predict execution time:
317
+ - Extracts startup overhead (one-time costs)
318
+ - Calculates per-sample throughput (variable costs)
319
+ - Uses linear regression to separate fixed from variable costs
320
+
321
+ **Accuracy:**
322
+ - Includes a 20% conservative buffer to account for API variability
323
+ - Typical accuracy: within 15-40% of actual runtime depending on workload
324
+ - Better to finish early than run over time!
325
+
326
+ **When to Use:**
327
+ - Before processing with your full dataset
328
+ - To identify bottleneck blocks and optimize your pipeline
329
+
295
330
  ### Runtime Parameters
296
331
 
297
332
  Runtime parameters allow you to customize block behavior at execution time without modifying flow YAML files. You can override global parameters for all blocks or configure specific blocks individually.
@@ -62,11 +62,14 @@ dataset = Dataset.from_dict({
62
62
  'icl_response_3': ['Java provides platform independence and strong object-oriented features.']
63
63
  })
64
64
 
65
- # Test with a small sample first (recommended!)
66
- print("🧪 Running dry run...")
67
- dry_result = flow.dry_run(dataset, sample_size=1)
65
+ # Test with a small sample AND get time estimate (recommended!)
66
+ print("🧪 Running dry run with time estimation...")
67
+ dry_result = flow.dry_run(dataset, sample_size=5, enable_time_estimation=True, max_concurrency=100)
68
68
  print(f"✅ Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
69
69
  print(f"📊 Output columns: {list(dry_result['final_dataset']['columns'])}")
70
+
71
+ # Time estimation is automatically displayed in a Rich table format
72
+ # The table shows estimated time, total API calls, and per-block breakdowns
70
73
  ```
71
74
 
72
75
  ## 📊 Step 3: Generate Synthetic Data
@@ -16,6 +16,7 @@ VLLM_MODEL=hosted_vllm/meta-llama/Llama-3.3-70B-Instruct
16
16
  VLLM_API_BASE=http://localhost:8000/v1
17
17
  VLLM_API_KEY=EMPTY
18
18
  ENABLE_REASONING=false
19
+ MAX_CONCURRENCY=50
19
20
  # =============================================================================
20
21
  # OPENAI CONFIGURATION
21
22
  # =============================================================================
@@ -38,7 +39,9 @@ MAAS_API_KEY=your-maas-api-key-here
38
39
  # =============================================================================
39
40
  # DATA CONFIGURATION
40
41
  # =============================================================================
41
- SEED_DATA_PATH=seed_data_val.jsonl
42
+ SEED_DATA_PATH=seed_data.jsonl
43
+ # Set this to subsample the seed data. Useful for debugging or for running on a validation set
44
+ SEED_DATA_SUBSAMPLE=24
42
45
  OUTPUT_DATA_FOLDER=output_data
43
46
  RUN_ON_VALIDATION_SET=true
44
47
  NUMBER_OF_SUMMARIES=50
@@ -0,0 +1,214 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "83f458de",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Document Pre-processing for Knowledge Tuning\n",
9
+ "\n",
10
+ "## Overview\n",
11
+ "\n",
12
+ "This notebook demonstrates a complete document preprocessing pipeline designed specifically for **knowledge tuning** with sdg-hub. \n",
13
+ "\n",
14
+ "## What This Notebook Does\n",
15
+ "\n",
16
+ "This preprocessing pipeline transforms raw documents (PDFs, Word docs, etc.) into seed data for data generation:\n",
17
+ "\n",
18
+ "1. **Document Parsing**: Converts raw documents to structured markdown format\n",
19
+ "2. **Chunking**: Splits documents into manageable chunks while preserving structure and context\n",
20
+ "3. **Seed Data Creation**: Formats chunks with in-context learning (ICL) templates for effective knowledge tuning\n",
21
+ "\n",
22
+ "## Prerequisites\n",
23
+ "\n",
24
+ "- We will use the existing InstructLab document parser (`docparser_v2.py`) and document parsing configuration (`docling_v2_config.yaml`)\n",
25
+ "- Raw PDF documents in the `document_collection/` directory\n"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "id": "daa22c74",
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "# Step 1: Document Processing Pipeline\n",
36
+ "# Define the directory containing raw documents to be processed\n",
37
+ "data_dir = 'document_collection/'\n",
38
+ "\n",
39
+ "# Run the document parser to convert documents to markdown\n",
40
+ "# - input-dir: Directory containing source documents\n",
41
+ "# - output-dir: Directory where processed markdown files will be saved\n",
42
+ "# - c: Configuration file specifying parsing parameters\n",
43
+ "!python ../instructlab/docparser_v2.py --input-dir {data_dir} --output-dir {data_dir} -c ../instructlab/docling_v2_config.yaml"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "id": "295749b5",
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "# Step 2: Install Required Dependencies\n",
54
+ "# Install packages needed for document processing and text chunking\n",
55
+ "\n",
56
+ "%pip install docling markdown-it-py\n",
57
+ "%pip install --upgrade transformers"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "id": "dd8a4a2a",
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
+ "# Step 3: Load Processed Document\n",
68
+ "import glob\n",
69
+ "\n",
70
+ "# In the example above, the docling step produces markdown for all the PDF files in document_collection; here we load only the first match returned by glob\n",
71
+ "with open(glob.glob(f'{data_dir}/*.md')[0], 'r') as f:\n",
72
+ " text = f.read()"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "id": "7614dc73",
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "# Step 4: Text Chunking and Dataset Creation\n",
83
+ "\n",
84
+ "from markdown_it import MarkdownIt \n",
85
+ "from typing import List\n",
86
+ "import datasets \n",
87
+ "\n",
88
+ "\n",
89
+ "def chunk_markdown(\n",
90
+ " text: str,\n",
91
+ " max_tokens: int = 200,\n",
92
+ " overlap: int = 50\n",
93
+ ") -> List[str]:\n",
94
+ " \"\"\"\n",
95
+ " Splits Markdown text into chunks at block-level elements\n",
96
+ " (headings, paragraphs, lists, tables, code, blockquotes).\n",
97
+ " Adds overlap (in words) between all consecutive chunks.\n",
98
+ " \n",
99
+ " Args:\n",
100
+ " text: The markdown text to be chunked\n",
101
+ " max_tokens: Maximum number of words per chunk\n",
102
+ " overlap: Number of overlapping words between consecutive chunks\n",
103
+ " \n",
104
+ " Returns:\n",
105
+ " List of text chunks with specified overlap\n",
106
+ " \"\"\"\n",
107
+ "\n",
108
+ " # Initialize markdown parser to understand document structure\n",
109
+ " md = MarkdownIt()\n",
110
+ " tokens = md.parse(text)\n",
111
+ "\n",
112
+ " # Group tokens into block-level segments to preserve markdown structure\n",
113
+ " # This ensures we don't split in the middle of headings, lists, etc.\n",
114
+ " blocks = []\n",
115
+ " buf = []\n",
116
+ " for tok in tokens:\n",
117
+ " if tok.block and tok.type.endswith(\"_open\"):\n",
118
+ " buf = []\n",
119
+ " elif tok.block and tok.type.endswith(\"_close\"):\n",
120
+ " if buf:\n",
121
+ " blocks.append(\"\\n\".join(buf).strip())\n",
122
+ " buf = []\n",
123
+ " elif tok.content:\n",
124
+ " buf.append(tok.content)\n",
125
+ " if buf:\n",
126
+ " blocks.append(\"\\n\".join(buf).strip())\n",
127
+ "\n",
128
+ " # Split blocks into chunks with overlap to maintain context continuity\n",
129
+ " chunks = []\n",
130
+ " current_words = []\n",
131
+ " for block in blocks:\n",
132
+ " words = block.split()\n",
133
+ " for w in words:\n",
134
+ " current_words.append(w)\n",
135
+ " if len(current_words) >= max_tokens:\n",
136
+ " # Emit a complete chunk\n",
137
+ " chunks.append(\" \".join(current_words))\n",
138
+ " # Prepare next buffer with overlap from the end of this chunk\n",
139
+ " # This ensures context continuity between chunks\n",
140
+ " current_words = current_words[-overlap:] if overlap > 0 else []\n",
141
+ "\n",
142
+ " # Add any remaining words as the final chunk\n",
143
+ " if current_words:\n",
144
+ " chunks.append(\" \".join(current_words))\n",
145
+ "\n",
146
+ " return chunks\n",
147
+ "\n",
148
+ "\n",
149
+ "chunks = chunk_markdown(text, max_tokens=5000, overlap=1000)\n",
150
+ "\n",
151
+ "\n",
152
+ "# Prepare seed data for the SDG-Hub knowledge pipeline.\n",
153
+ "# \n",
154
+ "# The seed data requires the following fields:\n",
155
+ "# - document_outline: A concise title or summary that accurately represents the entire document.\n",
156
+ "# For documents covering multiple themes, consider providing multiple outlines (one per section).\n",
157
+ "# - icl_document: A representative sample extract from the document. This may include tables, code snippets, definitions, etc.\n",
158
+ "# - icl_query_1, icl_query_2, icl_query_3: Three questions based on the icl_document sample.\n",
159
+ "# - domain: The domain or subject area of the document.\n",
160
+ "#\n",
161
+ "# The code below creates a HuggingFace Dataset from the document chunks,\n",
162
+ "# then maps the required ICL fields to each entry, and finally saves the result as a JSONL file.\n",
163
+ "\n",
164
+ "seed_data = datasets.Dataset.from_dict({'document': chunks})\n",
165
+ "\n",
166
+ "icl = {\n",
167
+ " \"document_outline\": \"The document contains excerpts from FINTRAC regulations designed to combat money laundering and terrorist financing in Canada\",\n",
168
+ " \"icl_document\": \"## Overview\\n\\nThis guidance came into effect on June 1, 2021.\\n\\n\\nThis guidance explains the methods that can be used by reporting entities\\n(REs) to verify the identity of a person or an entity.\\n\\n\\n## 1. Meaning of verifying the identity of a person or an entity\\n\\nIt means to use the methods described in this guidance to ensure that the\\ninformation in an identification document or from other informational\\nsources matches the information that the person or entity provided.\\n\\n\\nVerifying identity is a foundational element of Canada's anti-money\\nlaundering and anti-terrorist financing regime and a key component of an\\nRE's relationship with clients. It helps you to know your clients and to\\nunderstand and assess any risk that may be associated to their\\ntransactions or activities.\\n\\n\\n## 2. How to verify the identity of a person\\n\\nYou can use any of the 5 methods described below to identify a person:\\n\\n- 2.1 Government-issued photo identification method\\n\\n- 2.2 Credit file method\\n\\n- 2.3 Dual-process method\\n\\n- 2.4 Affiliate or member method\\n\\n- 2.5 Reliance method\\n\",\n",
169
+ " \"icl_query_1\": \"In Canada, what are the methods for verifying someone's identity?\",\n",
170
+ " \"icl_query_2\": \"In Canada, why is it important to confirm a client's identity?\",\n",
171
+ " \"icl_query_3\": \"In Canada, can I use Reliance method to verify identity of a person?\",\n",
172
+ " \"domain\": \"Finance\"\n",
173
+ "}\n",
174
+ "\n",
175
+ "# Map the ICL fields to each document chunk (if you want to use the same ICL for all, as shown here)\n",
176
+ "seed_data = seed_data.map(lambda x: icl)\n",
177
+ "\n",
178
+ "# Save the seed data to a JSONL file for downstream use\n",
179
+ "seed_data.to_json('seed_data.jsonl', orient='records', lines=True)"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "markdown",
184
+ "id": "44f3ff7f",
185
+ "metadata": {},
186
+ "source": [
187
+ "### Next Steps:\n",
188
+ "- The seed_data.jsonl file is now ready for the knowledge tuning pipeline.\n",
189
+ "- You can now proceed to the [knowledge generation](knowledge_generation.ipynb) notebook."
190
+ ]
191
+ }
192
+ ],
193
+ "metadata": {
194
+ "kernelspec": {
195
+ "display_name": "sdg_hub",
196
+ "language": "python",
197
+ "name": "python3"
198
+ },
199
+ "language_info": {
200
+ "codemirror_mode": {
201
+ "name": "ipython",
202
+ "version": 3
203
+ },
204
+ "file_extension": ".py",
205
+ "mimetype": "text/x-python",
206
+ "name": "python",
207
+ "nbconvert_exporter": "python",
208
+ "pygments_lexer": "ipython3",
209
+ "version": "3.11.12"
210
+ }
211
+ },
212
+ "nbformat": 4,
213
+ "nbformat_minor": 5
214
+ }
@@ -4,14 +4,44 @@
4
4
  "cell_type": "markdown",
5
5
  "metadata": {},
6
6
  "source": [
7
- "### Install SDG\n",
7
+ "# Enhanced Summary Knowledge Tuning - Data Generation\n",
8
+ "\n",
9
+ "## Overview\n",
10
+ "\n",
11
+ "This notebook demonstrates how to generate high-quality knowledge tuning datasets using the SDG Hub framework. It creates multiple types of document augmentations and corresponding question-answer pairs that can be used to train or fine-tune language models for enhanced summarization and knowledge extraction capabilities.\n",
12
+ "\n",
13
+ "## What This Notebook Does\n",
14
+ "\n",
15
+ "This notebook will:\n",
16
+ "\n",
17
+ "1. **Generate Four Types of Knowledge Tuning Datasets**:\n",
18
+ " - **Extractive Summaries**: Concise summaries that extract key information directly from source documents\n",
19
+ " - **Detailed Summaries**: Comprehensive summaries that provide thorough coverage of document content\n",
20
+ " - **Key Facts**: Structured fact extraction with corresponding Q&A pairs\n",
21
+ " - **Document-Based Q&A**: Question-answer pairs generated directly from document content\n",
22
+ "\n",
23
+ "\n",
24
+ "2. **Output Structured Training Data**:\n",
25
+ " - For each augmentation, we save a JSONL dataset.\n",
26
+ " - You can follow [knowledge_mixing](knowledge_mixing.ipynb) to convert it into a training dataset\n",
27
+ "\n",
28
+ "## Prerequisites\n",
29
+ "\n",
30
+ "- SDG Hub installed and configured\n",
31
+ "- Environment variables set up (see [.env.example](.env.example)). Specifically, set the model provider, seed data path, and output path.\n",
32
+ "- Document pre-processing completed (run [document_pre_processing.ipynb](document_pre_processing.ipynb) first)\n",
33
+ "\n",
8
34
  "```bash \n",
9
35
  "git clone https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git\n",
10
36
  "cd sdg_hub\n",
11
37
  "pip install .[examples]\n",
12
38
  "copy the .env.example to .env and set the model endpoint and generation/mixing parameters\n",
13
39
  "```\n",
14
- "**⚠️ If you haven't already, run the document pre-processing notebook to create the seed data.**"
40
+ "**⚠️ If you haven't already, run the document pre-processing notebook to create the seed data.**\n",
41
+ "\n",
42
+ "## Next Steps\n",
43
+ "\n",
44
+ "After running this notebook, use [knowledge_mixing](knowledge_mixing.ipynb) to combine and curate the generated datasets for final model training.\n"
15
45
  ]
16
46
  },
17
47
  {
@@ -50,7 +80,7 @@
50
80
  "metadata": {},
51
81
  "outputs": [],
52
82
  "source": [
53
- "def create_seed_data(run_on_validation=None, seed_data_path=None):\n",
83
+ "def create_seed_data_from_quality_benchmark(run_on_validation=None, seed_data_path=None):\n",
54
84
  " \"\"\"\n",
55
85
  " Create seed data from QuALITY Benchmark dataset.\n",
56
86
  " \n",
@@ -135,8 +165,20 @@
135
165
  "metadata": {},
136
166
  "outputs": [],
137
167
  "source": [
138
- "# Create seed data using the function\n",
139
- "quality_corpus = create_seed_data().select(range(1))"
168
+ "# Load seed data. If one is not provided, create it from the quality benchmark dataset.\n",
169
+ "seed_data_path = os.getenv('SEED_DATA_PATH', 'seed_data.jsonl')\n",
170
+ "\n",
171
+ "if not os.path.exists(seed_data_path):\n",
172
+ " print(f\"{seed_data_path} not found. Creating seed data...\")\n",
173
+ " quality_corpus = create_seed_data_from_quality_benchmark(seed_data_path=seed_data_path)\n",
174
+ "else:\n",
175
+ " print(f\"Loading existing seed data from {seed_data_path}\")\n",
176
+ " quality_corpus = load_dataset('json', data_files=seed_data_path, split='train')\n",
177
+ "\n",
178
+ "# Subsample the seed data. Useful for debugging.\n",
179
+ "subsample = int(os.getenv('SEED_DATA_SUBSAMPLE', '0'))\n",
180
+ "if subsample > 0:\n",
181
+ " quality_corpus = quality_corpus.select(range(subsample))"
140
182
  ]
141
183
  },
142
184
  {
@@ -229,23 +271,12 @@
229
271
  "execution_count": null,
230
272
  "metadata": {},
231
273
  "outputs": [],
232
- "source": [
233
- "# We will use below mapping of flow names to their respective summarization flows\n",
234
- "flow_name_map = {\n",
235
- " 'Detailed Summary Knowledge Tuning Dataset Generation Flow': 'gen_detailed_summary',\n",
236
- " 'Extractive Summary Knowledge Tuning Dataset Generation Flow': 'gen_extractive_summary',\n",
237
- " }"
238
- ]
239
- },
240
- {
241
- "cell_type": "code",
242
- "execution_count": 9,
243
- "metadata": {},
244
- "outputs": [],
245
274
  "source": [
246
275
  "# Get runtime parameters\n",
247
276
  "enable_reasoning = os.getenv('ENABLE_REASONING', 'false').lower() in ('1', 'true', 'yes')\n",
248
- "number_of_summaries = int(os.getenv('NUMBER_OF_SUMMARIES', '50'))"
277
+ "number_of_summaries = int(os.getenv('NUMBER_OF_SUMMARIES', '50'))\n",
278
+ "max_concurrency = int(os.getenv('MAX_CONCURRENCY', '50'))\n",
279
+ "save_data_path = os.getenv('OUTPUT_DATA_FOLDER', '')"
249
280
  ]
250
281
  },
251
282
  {
@@ -263,21 +294,21 @@
263
294
  "flow = set_model_config(flow)\n",
264
295
  "number_of_summaries = int(os.getenv('NUMBER_OF_SUMMARIES', '50'))\n",
265
296
  "# Generate data for extractive summary\n",
266
- "runtime_params = {\n",
267
- " flow_name_map[flow_name]: {\n",
268
- " 'n': number_of_summaries\n",
269
- " },\n",
270
- "}\n",
271
297
  "if enable_reasoning:\n",
272
298
  " # Increase max tokens to accommodate reasoning content\n",
273
299
  " runtime_params = {\n",
274
300
  " 'question_generation': {'max_tokens': 1024}, \n",
275
- " flow_name_map[flow_name]: {'n': number_of_summaries, 'max_tokens': 6000}\n",
301
+ " 'gen_extractive_summary': {'n': number_of_summaries, 'max_tokens': 6000}\n",
276
302
  " }\n",
277
- " \n",
303
+ "else:\n",
304
+ " runtime_params = {\n",
305
+ " 'gen_extractive_summary': {\n",
306
+ " 'n': number_of_summaries\n",
307
+ " }\n",
308
+ "}\n",
309
+ "\n",
310
+ "extractive_summary_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency)\n",
278
311
  "\n",
279
- "extractive_summary_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=2)\n",
280
- "save_data_path = os.getenv('OUTPUT_DATA_FOLDER', '')\n",
281
312
  "extractive_summary_generated_data.to_json(os.path.join(save_data_path, 'extractive_summary', 'gen.jsonl'), orient='records', lines=True)\n",
282
313
  "\n",
283
314
  "print(f\"✓ Extractive summary: {len(extractive_summary_generated_data)} records\")\n",
@@ -299,19 +330,19 @@
299
330
  "# Set model configuration\n",
300
331
  "flow = set_model_config(flow)\n",
301
332
  "\n",
302
- "runtime_params = ({flow_name_map[flow_name]: {\n",
303
- " 'n': number_of_summaries\n",
304
- " }})\n",
305
- "\n",
306
333
  "if enable_reasoning:\n",
307
334
  " # Increase max tokens to accommodate reasoning content\n",
308
335
  " runtime_params = {\n",
309
336
  " 'question_generation': {'max_tokens': 1024}, \n",
310
- " flow_name_map[flow_name]: {'n': number_of_summaries, 'max_tokens': 6000}\n",
337
+ " 'gen_detailed_summary': {'n': number_of_summaries, 'max_tokens': 6000}\n",
311
338
  " }\n",
339
+ "else:\n",
340
+ " runtime_params = ({'gen_detailed_summary': {\n",
341
+ " 'n': number_of_summaries\n",
342
+ " }})\n",
312
343
  "# Generate data for detailed summary\n",
313
344
  "detailed_summary_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=50)\n",
314
- "save_data_path = os.getenv('OUTPUT_DATA_FOLDER', '')\n",
345
+ "\n",
315
346
  "detailed_summary_generated_data.to_json(os.path.join(save_data_path, 'detailed_summary', 'gen.jsonl'), orient='records', lines=True)\n",
316
347
  "\n",
317
348
  "print(f\"✓ Detailed summary: {len(detailed_summary_generated_data)} records\")\n",
@@ -332,7 +363,7 @@
332
363
  "\n",
333
364
  "# Set model configuration\n",
334
365
  "flow = set_model_config(flow)\n",
335
- "\n",
366
+ "runtime_params = {}\n",
336
367
  "if enable_reasoning:\n",
337
368
  " # Increase max tokens for Question Generation to accommodate reasoning content\n",
338
369
  " runtime_params = {\n",
@@ -340,9 +371,8 @@
340
371
  " }\n",
341
372
  "\n",
342
373
  "# Generate data for key facts summary\n",
343
- "key_facts_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=50)\n",
374
+ "key_facts_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency)\n",
344
375
  "\n",
345
- "save_data_path = os.getenv('OUTPUT_DATA_FOLDER', '')\n",
346
376
  "key_facts_generated_data.to_json(os.path.join(save_data_path, 'key_facts_to_qa', 'gen.jsonl'), orient='records', lines=True)\n",
347
377
  "\n",
348
378
  "print(f\"✓ Key facts: {len(key_facts_generated_data)} records\")\n",
@@ -362,16 +392,15 @@
362
392
  "\n",
363
393
  "# Set model configuration\n",
364
394
  "flow = set_model_config(flow)\n",
365
- "\n",
395
+ "runtime_params = {}\n",
366
396
  "if enable_reasoning:\n",
367
397
  " # Increase max tokens to accommodate reasoning content\n",
368
398
  " runtime_params = {\n",
369
- " 'question_generation': {'max_tokens': 1024}, \n",
399
+ " 'question_generation': {'max_tokens': 2048}, \n",
370
400
  " }\n",
371
401
  "\n",
372
- "document_based_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=200)\n",
402
+ "document_based_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency)\n",
373
403
  " \n",
374
- "save_data_path = os.getenv('OUTPUT_DATA_FOLDER', '')\n",
375
404
  "document_based_generated_data.to_json(os.path.join(save_data_path, 'document_based_qa', 'gen.jsonl'), orient='records', lines=True)\n",
376
405
  "\n",
377
406
  "print(f\"✓ Document based: {len(document_based_generated_data)} records\")\n",
@@ -393,7 +422,7 @@
393
422
  ],
394
423
  "metadata": {
395
424
  "kernelspec": {
396
- "display_name": "test_nb",
425
+ "display_name": "sdg_hub",
397
426
  "language": "python",
398
427
  "name": "python3"
399
428
  },
@@ -407,7 +436,7 @@
407
436
  "name": "python",
408
437
  "nbconvert_exporter": "python",
409
438
  "pygments_lexer": "ipython3",
410
- "version": "3.12.8"
439
+ "version": "3.11.12"
411
440
  }
412
441
  },
413
442
  "nbformat": 4,