sdg-hub 0.5.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/PKG-INFO +2 -8
  2. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +11 -15
  3. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +145 -98
  4. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +135 -63
  5. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +143 -119
  6. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +47 -30
  7. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/docparser_v2.py +50 -35
  8. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +5 -4
  9. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +26 -11
  10. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/logger_config.py +1 -0
  11. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/knowledge_utils.py +92 -72
  12. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/text_analysis/structured_insights_demo.ipynb +53 -43
  13. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/pyproject.toml +1 -8
  14. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/_version.py +3 -3
  15. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/base.py +60 -58
  16. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
  17. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/llm/__init__.py +0 -2
  18. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
  19. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +13 -59
  20. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
  21. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/llm/text_parser_block.py +14 -61
  22. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
  23. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
  24. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
  25. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/transform/melt_columns.py +13 -12
  26. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/transform/rename_columns.py +20 -9
  27. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/transform/text_concat.py +20 -21
  28. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
  29. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/flow/base.py +139 -106
  30. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/flow/checkpointer.py +34 -36
  31. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/flow/validation.py +4 -4
  32. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/datautils.py +52 -54
  33. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/flow_metrics.py +9 -6
  34. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub.egg-info/PKG-INFO +2 -8
  35. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub.egg-info/SOURCES.txt +0 -8
  36. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub.egg-info/requires.txt +1 -8
  37. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/filtering/test_columnvaluefilter.py +29 -43
  38. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/llm/test_llm_chat_block.py +38 -40
  39. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/llm/test_llm_parser_block.py +41 -44
  40. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/llm/test_promptbuilderblock.py +26 -26
  41. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/llm/test_textparserblock.py +45 -42
  42. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/test_base_block.py +58 -62
  43. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/test_registry.py +40 -40
  44. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/transform/test_index_based_mapper.py +49 -38
  45. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/transform/test_json_structure_block.py +23 -23
  46. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/transform/test_melt_columns.py +42 -43
  47. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/transform/test_rename_columns.py +16 -17
  48. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/transform/test_text_concat.py +17 -18
  49. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/transform/test_uniform_col_val_setter.py +33 -34
  50. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/conftest.py +12 -9
  51. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/test_base.py +57 -62
  52. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/test_checkpointer.py +26 -26
  53. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/test_dataset_requirements.py +71 -64
  54. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/test_integration.py +9 -11
  55. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/test_time_estimation.py +11 -11
  56. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +13 -6
  57. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/utils/test_datautils.py +81 -110
  58. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/utils/test_flow_metrics.py +5 -6
  59. sdg_hub-0.5.1/examples/annotation/annotation_classification.ipynb +0 -486
  60. sdg_hub-0.5.1/examples/annotation/news_classification_assessment_prompt.yaml +0 -42
  61. sdg_hub-0.5.1/examples/annotation/news_classification_flow.yaml +0 -210
  62. sdg_hub-0.5.1/examples/annotation/news_classification_prompt.yaml +0 -11
  63. sdg_hub-0.5.1/examples/annotation/revise_news_classification_prompt.yaml +0 -19
  64. sdg_hub-0.5.1/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
  65. sdg_hub-0.5.1/src/sdg_hub/core/utils/temp_manager.py +0 -57
  66. sdg_hub-0.5.1/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +0 -1330
  67. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/actionlint.yaml +0 -0
  68. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/actions/free-disk-space/action.yml +0 -0
  69. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/dependabot.yml +0 -0
  70. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/mergify.yml +0 -0
  71. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/actionlint.dockerfile +0 -0
  72. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/actionlint.yml +0 -0
  73. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/docs.yml +0 -0
  74. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/integration-test.yml +0 -0
  75. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/lint.yml +0 -0
  76. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/matchers/actionlint.json +0 -0
  77. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/matchers/pylint.json +0 -0
  78. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/packer.yml +0 -0
  79. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/pypi.yaml +0 -0
  80. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.github/workflows/test.yml +0 -0
  81. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.gitignore +0 -0
  82. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.isort.cfg +0 -0
  83. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.markdownlint-cli2.yaml +0 -0
  84. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.pre-commit-config.yaml +0 -0
  85. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/.pylintrc +0 -0
  86. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/CLAUDE.md +0 -0
  87. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/CONTRIBUTING.md +0 -0
  88. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/LICENSE +0 -0
  89. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/Makefile +0 -0
  90. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/README.md +0 -0
  91. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/.nojekyll +0 -0
  92. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/README.md +0 -0
  93. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/_coverpage.md +0 -0
  94. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/_navbar.md +0 -0
  95. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/_sidebar.md +0 -0
  96. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/api-reference.md +0 -0
  97. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/blocks/custom-blocks.md +0 -0
  98. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/blocks/filtering-blocks.md +0 -0
  99. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/blocks/llm-blocks.md +0 -0
  100. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/blocks/overview.md +0 -0
  101. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/blocks/transform-blocks.md +0 -0
  102. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/concepts.md +0 -0
  103. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/development.md +0 -0
  104. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/flows/discovery.md +0 -0
  105. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/flows/overview.md +0 -0
  106. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/index.html +0 -0
  107. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/installation.md +0 -0
  108. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/docs/quick-start.md +0 -0
  109. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
  110. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  111. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
  112. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/README.md +0 -0
  113. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
  114. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
  115. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  116. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  117. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  118. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  119. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  120. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/text_analysis/README.md +0 -0
  121. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
  122. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/scripts/packer/centos.pkr.hcl +0 -0
  123. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/scripts/packer/setup-centos.sh +0 -0
  124. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/scripts/ruff.sh +0 -0
  125. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/setup.cfg +0 -0
  126. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/__init__.py +0 -0
  127. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/__init__.py +0 -0
  128. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/__init__.py +0 -0
  129. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
  130. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
  131. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/registry.py +0 -0
  132. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
  133. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/flow/__init__.py +0 -0
  134. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/flow/metadata.py +0 -0
  135. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/flow/registry.py +0 -0
  136. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/__init__.py +0 -0
  137. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/error_handling.py +0 -0
  138. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
  139. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
  140. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/logger_config.py +0 -0
  141. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/path_resolution.py +0 -0
  142. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/time_estimator.py +0 -0
  143. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
  144. {sdg_hub-0.5.1/tests → sdg_hub-0.6.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa}/__init__.py +0 -0
  145. {sdg_hub-0.5.1/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese → sdg_hub-0.6.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary}/__init__.py +0 -0
  146. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
  147. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +0 -0
  148. {sdg_hub-0.5.1/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab → sdg_hub-0.6.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa}/__init__.py +0 -0
  149. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -0
  150. {sdg_hub-0.5.1/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts → sdg_hub-0.6.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary}/__init__.py +0 -0
  151. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
  152. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +0 -0
  153. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
  154. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
  155. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
  156. {sdg_hub-0.5.1/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary → sdg_hub-0.6.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts}/__init__.py +0 -0
  157. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +0 -0
  158. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
  159. {sdg_hub-0.5.1/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese → sdg_hub-0.6.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/README.md +0 -0
  160. {sdg_hub-0.5.1/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa → sdg_hub-0.6.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  161. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
  162. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
  163. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
  164. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
  165. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
  166. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
  167. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -0
  168. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
  169. {sdg_hub-0.5.1/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab → sdg_hub-0.6.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese}/README.md +0 -0
  170. {sdg_hub-0.5.1/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary → sdg_hub-0.6.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese}/__init__.py +0 -0
  171. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
  172. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
  173. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
  174. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +0 -0
  175. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
  176. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
  177. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
  178. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
  179. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
  180. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
  181. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -0
  182. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
  183. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub/py.typed +0 -0
  184. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  185. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/src/sdg_hub.egg-info/top_level.txt +0 -0
  186. {sdg_hub-0.5.1/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa → sdg_hub-0.6.0/tests}/__init__.py +0 -0
  187. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/testdata/test_config.yaml +0 -0
  188. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
  189. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
  190. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
  191. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
  192. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
  193. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/__init__.py +0 -0
  194. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/test_metadata.py +0 -0
  195. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/test_registry.py +0 -0
  196. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/flow/test_validation.py +0 -0
  197. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/integration/README.md +0 -0
  198. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/integration/__init__.py +0 -0
  199. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  200. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
  201. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
  202. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +0 -0
  203. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/utils/test_error_handling.py +0 -0
  204. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tests/utils/test_path_resolution.py +0 -0
  205. {sdg_hub-0.5.1 → sdg_hub-0.6.0}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -28,23 +28,17 @@ Requires-Dist: httpx<1.0.0,>=0.25.0
28
28
  Requires-Dist: jinja2
29
29
  Requires-Dist: litellm<1.75.0,>=1.73.0
30
30
  Requires-Dist: rich
31
+ Requires-Dist: pandas
31
32
  Requires-Dist: pydantic<3.0.0,>=2.0.0
32
33
  Requires-Dist: python-dotenv<2.0.0,>=1.0.0
33
34
  Requires-Dist: tenacity!=8.4.0,>=8.3.0
34
35
  Requires-Dist: tqdm<5.0.0,>=4.66.2
35
- Provides-Extra: vllm
36
- Requires-Dist: vllm>=0.9.1; extra == "vllm"
37
- Requires-Dist: torch>=2.0.0; extra == "vllm"
38
- Requires-Dist: transformers>=4.37.0; extra == "vllm"
39
- Requires-Dist: accelerate>=0.21.0; extra == "vllm"
40
- Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
41
36
  Provides-Extra: examples
42
37
  Requires-Dist: tabulate>=0.9.0; extra == "examples"
43
38
  Requires-Dist: transformers>=4.37.0; extra == "examples"
44
39
  Requires-Dist: langchain-text-splitters; extra == "examples"
45
40
  Requires-Dist: docling>=2.3.0; extra == "examples"
46
41
  Requires-Dist: scikit-learn; extra == "examples"
47
- Requires-Dist: pandas; extra == "examples"
48
42
  Requires-Dist: polars; extra == "examples"
49
43
  Requires-Dist: matplotlib; extra == "examples"
50
44
  Requires-Dist: spacy; extra == "examples"
@@ -34,7 +34,7 @@
34
34
  "source": [
35
35
  "# Step 1: Document Processing Pipeline\n",
36
36
  "# Define the directory containing raw documents to be processed\n",
37
- "data_dir = 'document_collection/'\n",
37
+ "data_dir = \"document_collection/\"\n",
38
38
  "\n",
39
39
  "# Run the document parser to convert documents to markdown\n",
40
40
  "# - input-dir: Directory containing source documents\n",
@@ -68,7 +68,7 @@
68
68
  "import glob\n",
69
69
  "\n",
70
70
  "# In our example above docling step produces markdown of all the pdf files in the document_collection\n",
71
- "with open(glob.glob(f'{data_dir}/*.md')[0], 'r') as f:\n",
71
+ "with open(glob.glob(f\"{data_dir}/*.md\")[0], \"r\") as f:\n",
72
72
  " text = f.read()"
73
73
  ]
74
74
  },
@@ -81,26 +81,22 @@
81
81
  "source": [
82
82
  "# Step 4: Text Chunking and Dataset Creation\n",
83
83
  "\n",
84
- "from markdown_it import MarkdownIt \n",
84
+ "from markdown_it import MarkdownIt\n",
85
85
  "from typing import List\n",
86
- "import datasets \n",
86
+ "import datasets\n",
87
87
  "\n",
88
88
  "\n",
89
- "def chunk_markdown(\n",
90
- " text: str,\n",
91
- " max_tokens: int = 200,\n",
92
- " overlap: int = 50\n",
93
- ") -> List[str]:\n",
89
+ "def chunk_markdown(text: str, max_tokens: int = 200, overlap: int = 50) -> List[str]:\n",
94
90
  " \"\"\"\n",
95
91
  " Splits Markdown text into chunks at block-level elements\n",
96
92
  " (headings, paragraphs, lists, tables, code, blockquotes).\n",
97
93
  " Adds overlap (in words) between all consecutive chunks.\n",
98
- " \n",
94
+ "\n",
99
95
  " Args:\n",
100
96
  " text: The markdown text to be chunked\n",
101
97
  " max_tokens: Maximum number of words per chunk\n",
102
98
  " overlap: Number of overlapping words between consecutive chunks\n",
103
- " \n",
99
+ "\n",
104
100
  " Returns:\n",
105
101
  " List of text chunks with specified overlap\n",
106
102
  " \"\"\"\n",
@@ -150,7 +146,7 @@
150
146
  "\n",
151
147
  "\n",
152
148
  "# Prepare seed data for the SDG-Hub knowledge pipeline.\n",
153
- "# \n",
149
+ "#\n",
154
150
  "# The seed data requires the following fields:\n",
155
151
  "# - document_outline: A concise title or summary that accurately represents the entire document.\n",
156
152
  "# For documents covering multiple themes, consider providing multiple outlines (one per section).\n",
@@ -161,7 +157,7 @@
161
157
  "# The code below creates a HuggingFace Dataset from the document chunks,\n",
162
158
  "# then maps the required ICL fields to each entry, and finally saves the result as a JSONL file.\n",
163
159
  "\n",
164
- "seed_data = datasets.Dataset.from_dict({'document': chunks})\n",
160
+ "seed_data = datasets.Dataset.from_dict({\"document\": chunks})\n",
165
161
  "\n",
166
162
  "icl = {\n",
167
163
  " \"document_outline\": \"The document contains excerpts from FINTRAC regulations designed to combat money laundering and terrorist financing in Canada\",\n",
@@ -169,14 +165,14 @@
169
165
  " \"icl_query_1\": \"In Canada, what are the methods for verifying someone's identity?\",\n",
170
166
  " \"icl_query_2\": \"In Canada, why is it important to confirm a client's identity?\",\n",
171
167
  " \"icl_query_3\": \"In Canada, can I use Reliance method to verify identity of a person?\",\n",
172
- " \"domain\": \"Finance\"\n",
168
+ " \"domain\": \"Finance\",\n",
173
169
  "}\n",
174
170
  "\n",
175
171
  "# Map the ICL fields to each document chunk (if you want to use the same ICL for all, as shown here)\n",
176
172
  "seed_data = seed_data.map(lambda x: icl)\n",
177
173
  "\n",
178
174
  "# Save the seed data to a JSONL file for downstream use\n",
179
- "seed_data.to_json('seed_data.jsonl', orient='records', lines=True)"
175
+ "seed_data.to_json(\"seed_data.jsonl\", orient=\"records\", lines=True)"
180
176
  ]
181
177
  },
182
178
  {
@@ -71,7 +71,7 @@
71
71
  "# Required to run the flow with async mode\n",
72
72
  "import nest_asyncio\n",
73
73
  "\n",
74
- "nest_asyncio.apply() "
74
+ "nest_asyncio.apply()"
75
75
  ]
76
76
  },
77
77
  {
@@ -80,82 +80,90 @@
80
80
  "metadata": {},
81
81
  "outputs": [],
82
82
  "source": [
83
- "def create_seed_data_from_quality_benchmark(run_on_validation=None, seed_data_path=None):\n",
83
+ "def create_seed_data_from_quality_benchmark(\n",
84
+ " run_on_validation=None, seed_data_path=None\n",
85
+ "):\n",
84
86
  " \"\"\"\n",
85
87
  " Create seed data from QuALITY Benchmark dataset.\n",
86
- " \n",
88
+ "\n",
87
89
  " Args:\n",
88
90
  " run_on_validation (bool, optional): If True, use validation subset. If None, reads from env.\n",
89
91
  " seed_data_path (str, optional): Path to save seed data. If None, reads from env.\n",
90
- " \n",
92
+ "\n",
91
93
  " Returns:\n",
92
94
  " datasets.Dataset: The processed corpus\n",
93
95
  " \"\"\"\n",
94
96
  " # Use environment variables as defaults if not provided\n",
95
97
  " if run_on_validation is None:\n",
96
- " run_on_validation = os.getenv('RUN_ON_VALIDATION_SET', 'true').lower() == 'true'\n",
98
+ " run_on_validation = os.getenv(\"RUN_ON_VALIDATION_SET\", \"true\").lower() == \"true\"\n",
97
99
  " if seed_data_path is None:\n",
98
- " seed_data_path = os.getenv('SEED_DATA_PATH', 'seed_data_val.jsonl')\n",
99
- " \n",
100
+ " seed_data_path = os.getenv(\"SEED_DATA_PATH\", \"seed_data_val.jsonl\")\n",
101
+ "\n",
100
102
  " # Load QuALITY Benchmark dataset\n",
101
103
  " print(\"Loading QuALITY Benchmark dataset...\")\n",
102
- " quality_corpus = load_dataset(\"zitongyang/entigraph-quality-corpus\", split='train').remove_columns(['entity', 'entigraph']).rename_columns({'raw': 'document', 'uid': 'document_outline'})\n",
103
- " \n",
104
+ " quality_corpus = (\n",
105
+ " load_dataset(\"zitongyang/entigraph-quality-corpus\", split=\"train\")\n",
106
+ " .remove_columns([\"entity\", \"entigraph\"])\n",
107
+ " .rename_columns({\"raw\": \"document\", \"uid\": \"document_outline\"})\n",
108
+ " )\n",
109
+ "\n",
104
110
  " # Define seed examples for knowledge tuning\n",
105
111
  " seed_examples = {\n",
106
112
  " \"icl_document\": (\n",
107
- " \"The coastal town of Willow Creek, once renowned for its pristine beaches, now struggles with rampant pollution. Plastic debris and oil spills have devastated marine life, prompting a decline in tourism and fishing industries. Residents have organized weekly clean-up initiatives, but the scale of the problem overwhelms their efforts.\",\n",
108
- " \"Technologists at the local university have developed an AI-powered buoy system to combat this. The buoys, equipped with solar panels and filtration technology, can identify and absorb oil spills while collecting microplastics. Data from the buoys is shared publicly, raising awareness and pressuring corporations to adopt sustainable practices. Though costly, the project has sparked hope for revitalizing the ecosystem and economy.\"\n",
113
+ " \"The coastal town of Willow Creek, once renowned for its pristine beaches, now struggles with rampant pollution. Plastic debris and oil spills have devastated marine life, prompting a decline in tourism and fishing industries. Residents have organized weekly clean-up initiatives, but the scale of the problem overwhelms their efforts.\",\n",
114
+ " \"Technologists at the local university have developed an AI-powered buoy system to combat this. The buoys, equipped with solar panels and filtration technology, can identify and absorb oil spills while collecting microplastics. Data from the buoys is shared publicly, raising awareness and pressuring corporations to adopt sustainable practices. Though costly, the project has sparked hope for revitalizing the ecosystem and economy.\",\n",
109
115
  " ),\n",
110
116
  " \"icl_query_1\": \"How does the technological solution address the economic *and* environmental challenges highlighted in the document?\",\n",
111
117
  " \"icl_query_2\": \"What implicit values or priorities do the community's actions (clean-up initiatives) and the technologists' project reflect, and how do these align or contrast?\",\n",
112
118
  " \"icl_query_3\": \"Imagine the buoy project succeeds. What unintended consequences might arise from its impact, considering document's themes?\",\n",
113
- " \"domain\": \"articles/essays\"\n",
119
+ " \"domain\": \"articles/essays\",\n",
114
120
  " }\n",
115
- " \n",
121
+ "\n",
116
122
  " # Add seed examples to the corpus\n",
117
123
  " quality_corpus = quality_corpus.map(lambda x: seed_examples)\n",
118
- " \n",
124
+ "\n",
119
125
  " if run_on_validation:\n",
120
126
  " # Validation set - use predefined document IDs for consistent evaluation\n",
121
127
  " DOC_UIDS = [\n",
122
- " ' Defining Decay Down by David Plotz',\n",
123
- " ' Fight Clubbed by David Plotz',\n",
124
- " ' I, Antichrist? by Jeffrey Goldberg',\n",
128
+ " \" Defining Decay Down by David Plotz\",\n",
129
+ " \" Fight Clubbed by David Plotz\",\n",
130
+ " \" I, Antichrist? by Jeffrey Goldberg\",\n",
125
131
  " \" It's Time To Keelhaul U-Haul! by Jeffrey Goldberg\",\n",
126
132
  " \" My Father's Estate by Ben Stein\",\n",
127
133
  " '\"Phone Me in Central Park\" by McConnell, James V.',\n",
128
- " 'A Coffin for Jacob by Ludwig, Edward W.',\n",
129
- " 'A Fall of Glass by Lee, Stanley R.',\n",
130
- " 'A Filbert Is a Nut by Raphael, Rick',\n",
131
- " 'A Gift from Earth by Banister, Manly',\n",
132
- " 'A Gleeb for Earth by Schafhauser, Charles',\n",
133
- " 'A Good Year for the Roses? by David Edelstein',\n",
134
- " 'A Pail of Air by Leiber, Fritz',\n",
135
- " 'A Planet Named Joe by Hunter, Evan',\n",
134
+ " \"A Coffin for Jacob by Ludwig, Edward W.\",\n",
135
+ " \"A Fall of Glass by Lee, Stanley R.\",\n",
136
+ " \"A Filbert Is a Nut by Raphael, Rick\",\n",
137
+ " \"A Gift from Earth by Banister, Manly\",\n",
138
+ " \"A Gleeb for Earth by Schafhauser, Charles\",\n",
139
+ " \"A Good Year for the Roses? by David Edelstein\",\n",
140
+ " \"A Pail of Air by Leiber, Fritz\",\n",
141
+ " \"A Planet Named Joe by Hunter, Evan\",\n",
136
142
  " \"AI: what's the worst that could happen? by Harry Armstrong\",\n",
137
- " 'Accidental Death by Baily, Peter',\n",
138
- " 'All Day September by Kuykendall, Roger',\n",
139
- " 'Ambition by Bade, William L.',\n",
140
- " 'And Then the Town Took Off by Wilson, Richard',\n",
141
- " 'Atom Mystery [Young Atom Detective] by Coombs, Charles Ira',\n",
142
- " 'Beach Scene by King, Marshall',\n",
143
- " 'Big Ancestor by Wallace, F. L. (Floyd L.)',\n",
144
- " 'Birds of a Feather by Silverberg, Robert',\n",
145
- " 'Bodyguard by Gold, H. L. (Horace Leonard)'\n",
143
+ " \"Accidental Death by Baily, Peter\",\n",
144
+ " \"All Day September by Kuykendall, Roger\",\n",
145
+ " \"Ambition by Bade, William L.\",\n",
146
+ " \"And Then the Town Took Off by Wilson, Richard\",\n",
147
+ " \"Atom Mystery [Young Atom Detective] by Coombs, Charles Ira\",\n",
148
+ " \"Beach Scene by King, Marshall\",\n",
149
+ " \"Big Ancestor by Wallace, F. L. (Floyd L.)\",\n",
150
+ " \"Birds of a Feather by Silverberg, Robert\",\n",
151
+ " \"Bodyguard by Gold, H. L. (Horace Leonard)\",\n",
146
152
  " ]\n",
147
- " \n",
153
+ "\n",
148
154
  " # Filter corpus to validation set\n",
149
- " quality_corpus = quality_corpus.filter(lambda x: x['document_outline'] in DOC_UIDS)\n",
155
+ " quality_corpus = quality_corpus.filter(\n",
156
+ " lambda x: x[\"document_outline\"] in DOC_UIDS\n",
157
+ " )\n",
150
158
  " print(f\"Running on validation set with {len(quality_corpus)} documents\")\n",
151
159
  " else:\n",
152
160
  " # Use full dataset for training\n",
153
161
  " print(f\"Running on full dataset with {len(quality_corpus)} documents\")\n",
154
- " \n",
162
+ "\n",
155
163
  " # Save the seed data\n",
156
- " quality_corpus.to_json(seed_data_path, orient='records', lines=True)\n",
164
+ " quality_corpus.to_json(seed_data_path, orient=\"records\", lines=True)\n",
157
165
  " print(f\"Saved seed data to: {seed_data_path}\")\n",
158
- " \n",
166
+ "\n",
159
167
  " return quality_corpus"
160
168
  ]
161
169
  },
@@ -166,19 +174,22 @@
166
174
  "outputs": [],
167
175
  "source": [
168
176
  "# Load seed data. If one is not provided, create it from the quality benchmark dataset.\n",
169
- "seed_data_path = os.getenv('SEED_DATA_PATH', 'seed_data.jsonl')\n",
177
+ "seed_data_path = os.getenv(\"SEED_DATA_PATH\", \"seed_data.jsonl\")\n",
170
178
  "\n",
171
179
  "if not os.path.exists(seed_data_path):\n",
172
180
  " print(f\"{seed_data_path} not found. Creating seed data...\")\n",
173
- " quality_corpus = create_seed_data_from_quality_benchmark(seed_data_path=seed_data_path)\n",
181
+ " quality_corpus = create_seed_data_from_quality_benchmark(\n",
182
+ " seed_data_path=seed_data_path\n",
183
+ " )\n",
174
184
  "else:\n",
175
185
  " print(f\"Loading existing seed data from {seed_data_path}\")\n",
176
- " quality_corpus = load_dataset('json', data_files=seed_data_path, split='train')\n",
186
+ " quality_corpus = load_dataset(\"json\", data_files=seed_data_path, split=\"train\")\n",
177
187
  "\n",
178
188
  "# Subsample the seed data. Useful for debugging.\n",
179
- "subsample = int(os.getenv('SEED_DATA_SUBSAMPLE', '0'))\n",
189
+ "subsample = int(os.getenv(\"SEED_DATA_SUBSAMPLE\", \"0\"))\n",
180
190
  "if subsample > 0:\n",
181
- " quality_corpus = quality_corpus.select(range(subsample))"
191
+ " quality_corpus = quality_corpus.select(range(subsample))\n",
192
+ "quality_corpus = quality_corpus.to_pandas()"
182
193
  ]
183
194
  },
184
195
  {
@@ -200,14 +211,20 @@
200
211
  "source": [
201
212
  "# Setup model configuration in flow object\n",
202
213
  "def set_model_config(flow_object):\n",
203
- " model_provider = os.getenv('MODEL_PROVIDER', 'hosted_vllm')\n",
214
+ " model_provider = os.getenv(\"MODEL_PROVIDER\", \"hosted_vllm\")\n",
204
215
  " print(f\"Using model provider: {model_provider}\")\n",
205
216
  " # Set model provider\n",
206
- " if model_provider == 'hosted_vllm': \n",
207
- " vllm_model = os.getenv('VLLM_MODEL', 'hosted_vllm/meta-llama/Llama-3.3-70B-Instruct')\n",
208
- " vllm_api_base = os.getenv('VLLM_API_BASE', 'http://localhost:8000/v1')\n",
209
- " vllm_api_key = os.getenv('VLLM_API_KEY', 'EMPTY')\n",
210
- " enable_reasoning = os.getenv('ENABLE_REASONING', 'false').lower() in ('1', 'true', 'yes')\n",
217
+ " if model_provider == \"hosted_vllm\":\n",
218
+ " vllm_model = os.getenv(\n",
219
+ " \"VLLM_MODEL\", \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\"\n",
220
+ " )\n",
221
+ " vllm_api_base = os.getenv(\"VLLM_API_BASE\", \"http://localhost:8000/v1\")\n",
222
+ " vllm_api_key = os.getenv(\"VLLM_API_KEY\", \"EMPTY\")\n",
223
+ " enable_reasoning = os.getenv(\"ENABLE_REASONING\", \"false\").lower() in (\n",
224
+ " \"1\",\n",
225
+ " \"true\",\n",
226
+ " \"yes\",\n",
227
+ " )\n",
211
228
  " print(f\"Using reasoning: {enable_reasoning}\")\n",
212
229
  " flow_object.set_model_config(\n",
213
230
  " model=vllm_model,\n",
@@ -215,30 +232,30 @@
215
232
  " api_key=vllm_api_key,\n",
216
233
  " enable_reasoning=enable_reasoning,\n",
217
234
  " )\n",
218
- " elif model_provider == 'openai':\n",
219
- " openai_api_key = os.getenv('OPENAI_API_KEY')\n",
220
- " openai_model = os.getenv('OPENAI_MODEL', 'openai/gpt-4')\n",
235
+ " elif model_provider == \"openai\":\n",
236
+ " openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n",
237
+ " openai_model = os.getenv(\"OPENAI_MODEL\", \"openai/gpt-4\")\n",
221
238
  " flow_object.set_model_config(\n",
222
239
  " model=openai_model,\n",
223
240
  " api_key=openai_api_key,\n",
224
241
  " )\n",
225
- " elif model_provider == 'ollama':\n",
226
- " ollama_model = os.getenv('OLLAMA_MODEL', 'ollama/gemma2')\n",
227
- " ollama_api_base = os.getenv('OLLAMA_API_BASE', 'http://localhost:11434')\n",
242
+ " elif model_provider == \"ollama\":\n",
243
+ " ollama_model = os.getenv(\"OLLAMA_MODEL\", \"ollama/gemma2\")\n",
244
+ " ollama_api_base = os.getenv(\"OLLAMA_API_BASE\", \"http://localhost:11434\")\n",
228
245
  " flow_object.set_model_config(\n",
229
246
  " model=ollama_model,\n",
230
247
  " api_base=ollama_api_base,\n",
231
248
  " )\n",
232
- " elif model_provider == 'maas':\n",
233
- " maas_model = os.getenv('MAAS_MODEL')\n",
234
- " maas_api_base = os.getenv('MAAS_API_BASE')\n",
235
- " maas_api_key = os.getenv('MAAS_API_KEY')\n",
249
+ " elif model_provider == \"maas\":\n",
250
+ " maas_model = os.getenv(\"MAAS_MODEL\")\n",
251
+ " maas_api_base = os.getenv(\"MAAS_API_BASE\")\n",
252
+ " maas_api_key = os.getenv(\"MAAS_API_KEY\")\n",
236
253
  " flow_object.set_model_config(\n",
237
254
  " model=maas_model,\n",
238
255
  " api_base=maas_api_base,\n",
239
256
  " api_key=maas_api_key,\n",
240
257
  " )\n",
241
- " return flow_object "
258
+ " return flow_object"
242
259
  ]
243
260
  },
244
261
  {
@@ -273,10 +290,14 @@
273
290
  "outputs": [],
274
291
  "source": [
275
292
  "# Get runtime parameters\n",
276
- "enable_reasoning = os.getenv('ENABLE_REASONING', 'false').lower() in ('1', 'true', 'yes')\n",
277
- "number_of_summaries = int(os.getenv('NUMBER_OF_SUMMARIES', '50'))\n",
278
- "max_concurrency = int(os.getenv('MAX_CONCURRENCY', '50'))\n",
279
- "save_data_path = os.getenv('OUTPUT_DATA_FOLDER', '')"
293
+ "enable_reasoning = os.getenv(\"ENABLE_REASONING\", \"false\").lower() in (\n",
294
+ " \"1\",\n",
295
+ " \"true\",\n",
296
+ " \"yes\",\n",
297
+ ")\n",
298
+ "number_of_summaries = int(os.getenv(\"NUMBER_OF_SUMMARIES\", \"50\"))\n",
299
+ "max_concurrency = int(os.getenv(\"MAX_CONCURRENCY\", \"50\"))\n",
300
+ "save_data_path = os.getenv(\"OUTPUT_DATA_FOLDER\", \"\")"
280
301
  ]
281
302
  },
282
303
  {
@@ -292,28 +313,32 @@
292
313
  "\n",
293
314
  "# Set model configuration\n",
294
315
  "flow = set_model_config(flow)\n",
295
- "number_of_summaries = int(os.getenv('NUMBER_OF_SUMMARIES', '50'))\n",
316
+ "number_of_summaries = int(os.getenv(\"NUMBER_OF_SUMMARIES\", \"50\"))\n",
296
317
  "# Generate data for extractive summary\n",
297
318
  "if enable_reasoning:\n",
298
319
  " # Increase max tokens to accommodate reasoning content\n",
299
320
  " runtime_params = {\n",
300
- " 'question_generation': {'max_tokens': 1024}, \n",
301
- " 'gen_extractive_summary': {'n': number_of_summaries, 'max_tokens': 6000}\n",
302
- " }\n",
303
- "else:\n",
304
- " runtime_params = {\n",
305
- " 'gen_extractive_summary': {\n",
306
- " 'n': number_of_summaries\n",
321
+ " \"question_generation\": {\"max_tokens\": 1024},\n",
322
+ " \"gen_extractive_summary\": {\"n\": number_of_summaries, \"max_tokens\": 6000},\n",
307
323
  " }\n",
308
- "}\n",
324
+ "else:\n",
325
+ " runtime_params = {\"gen_extractive_summary\": {\"n\": number_of_summaries}}\n",
309
326
  "\n",
310
- "extractive_summary_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency)\n",
327
+ "extractive_summary_generated_data = flow.generate(\n",
328
+ " quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency\n",
329
+ ")\n",
311
330
  "\n",
312
- "extractive_summary_generated_data.to_json(os.path.join(save_data_path, 'extractive_summary', 'gen.jsonl'), orient='records', lines=True)\n",
331
+ "os.makedirs(os.path.join(save_data_path, \"extractive_summary\"), exist_ok=True)\n",
332
+ "\n",
333
+ "extractive_summary_generated_data.to_json(\n",
334
+ " os.path.join(save_data_path, \"extractive_summary\", \"gen.jsonl\"),\n",
335
+ " orient=\"records\",\n",
336
+ " lines=True,\n",
337
+ ")\n",
313
338
  "\n",
314
339
  "print(f\"✓ Extractive summary: {len(extractive_summary_generated_data)} records\")\n",
315
340
  "\n",
316
- "print(f\"✓ Columns: {list(extractive_summary_generated_data.column_names)}\")"
341
+ "print(f\"✓ Columns: {list(extractive_summary_generated_data.columns.tolist())}\")"
317
342
  ]
318
343
  },
319
344
  {
@@ -333,21 +358,27 @@
333
358
  "if enable_reasoning:\n",
334
359
  " # Increase max tokens to accommodate reasoning content\n",
335
360
  " runtime_params = {\n",
336
- " 'question_generation': {'max_tokens': 1024}, \n",
337
- " 'gen_detailed_summary': {'n': number_of_summaries, 'max_tokens': 6000}\n",
338
- " }\n",
361
+ " \"question_generation\": {\"max_tokens\": 1024},\n",
362
+ " \"gen_detailed_summary\": {\"n\": number_of_summaries, \"max_tokens\": 6000},\n",
363
+ " }\n",
339
364
  "else:\n",
340
- " runtime_params = ({'gen_detailed_summary': {\n",
341
- " 'n': number_of_summaries\n",
342
- " }})\n",
365
+ " runtime_params = {\"gen_detailed_summary\": {\"n\": number_of_summaries}}\n",
343
366
  "# Generate data for detailed summary\n",
344
- "detailed_summary_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=50)\n",
367
+ "detailed_summary_generated_data = flow.generate(\n",
368
+ " quality_corpus, runtime_params=runtime_params, max_concurrency=50\n",
369
+ ")\n",
370
+ "\n",
371
+ "os.makedirs(os.path.join(save_data_path, \"detailed_summary\"), exist_ok=True)\n",
345
372
  "\n",
346
- "detailed_summary_generated_data.to_json(os.path.join(save_data_path, 'detailed_summary', 'gen.jsonl'), orient='records', lines=True)\n",
373
+ "detailed_summary_generated_data.to_json(\n",
374
+ " os.path.join(save_data_path, \"detailed_summary\", \"gen.jsonl\"),\n",
375
+ " orient=\"records\",\n",
376
+ " lines=True,\n",
377
+ ")\n",
347
378
  "\n",
348
379
  "print(f\"✓ Detailed summary: {len(detailed_summary_generated_data)} records\")\n",
349
380
  "\n",
350
- "print(f\"✓ Columns: {list(detailed_summary_generated_data.column_names)}\")"
381
+ "print(f\"✓ Columns: {list(detailed_summary_generated_data.columns.tolist())}\")"
351
382
  ]
352
383
  },
353
384
  {
@@ -356,7 +387,7 @@
356
387
  "metadata": {},
357
388
  "outputs": [],
358
389
  "source": [
359
- "# Generate similar data for key facts \n",
390
+ "# Generate similar data for key facts\n",
360
391
  "flow_name = \"Key Facts Knowledge Tuning Dataset Generation Flow\"\n",
361
392
  "flow_path = FlowRegistry.get_flow_path(flow_name)\n",
362
393
  "flow = Flow.from_yaml(flow_path)\n",
@@ -367,17 +398,25 @@
367
398
  "if enable_reasoning:\n",
368
399
  " # Increase max tokens for Question Generation to accommodate reasoning content\n",
369
400
  " runtime_params = {\n",
370
- " 'generate_key_fact_qa': {'max_tokens': 6000}, \n",
371
- " }\n",
401
+ " \"generate_key_fact_qa\": {\"max_tokens\": 6000},\n",
402
+ " }\n",
372
403
  "\n",
373
404
  "# Generate data for key facts summary\n",
374
- "key_facts_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency)\n",
405
+ "key_facts_generated_data = flow.generate(\n",
406
+ " quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency\n",
407
+ ")\n",
408
+ "\n",
409
+ "os.makedirs(os.path.join(save_data_path, \"key_facts_to_qa\"), exist_ok=True)\n",
375
410
  "\n",
376
- "key_facts_generated_data.to_json(os.path.join(save_data_path, 'key_facts_to_qa', 'gen.jsonl'), orient='records', lines=True)\n",
411
+ "key_facts_generated_data.to_json(\n",
412
+ " os.path.join(save_data_path, \"key_facts_to_qa\", \"gen.jsonl\"),\n",
413
+ " orient=\"records\",\n",
414
+ " lines=True,\n",
415
+ ")\n",
377
416
  "\n",
378
417
  "print(f\"✓ Key facts: {len(key_facts_generated_data)} records\")\n",
379
418
  "\n",
380
- "print(f\"✓ Columns: {list(key_facts_generated_data.column_names)}\")"
419
+ "print(f\"✓ Columns: {list(key_facts_generated_data.columns.tolist())}\")"
381
420
  ]
382
421
  },
383
422
  {
@@ -396,16 +435,24 @@
396
435
  "if enable_reasoning:\n",
397
436
  " # Increase max tokens to accommodate reasoning content\n",
398
437
  " runtime_params = {\n",
399
- " 'question_generation': {'max_tokens': 2048}, \n",
400
- " }\n",
438
+ " \"question_generation\": {\"max_tokens\": 2048},\n",
439
+ " }\n",
440
+ "\n",
441
+ "document_based_generated_data = flow.generate(\n",
442
+ " quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency\n",
443
+ ")\n",
444
+ "\n",
445
+ "os.makedirs(os.path.join(save_data_path, \"document_based_qa\"), exist_ok=True)\n",
401
446
  "\n",
402
- "document_based_generated_data = flow.generate(quality_corpus, runtime_params=runtime_params, max_concurrency=max_concurrency)\n",
403
- " \n",
404
- "document_based_generated_data.to_json(os.path.join(save_data_path, 'document_based_qa', 'gen.jsonl'), orient='records', lines=True)\n",
447
+ "document_based_generated_data.to_json(\n",
448
+ " os.path.join(save_data_path, \"document_based_qa\", \"gen.jsonl\"),\n",
449
+ " orient=\"records\",\n",
450
+ " lines=True,\n",
451
+ ")\n",
405
452
  "\n",
406
453
  "print(f\"✓ Document based: {len(document_based_generated_data)} records\")\n",
407
454
  "\n",
408
- "print(f\"✓ Columns: {list(document_based_generated_data.column_names)}\")"
455
+ "print(f\"✓ Columns: {list(document_based_generated_data.columns.tolist())}\")"
409
456
  ]
410
457
  },
411
458
  {