sdg-hub 0.7.0__tar.gz → 0.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/docs.yml +1 -1
  2. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/integration-test.yml +3 -2
  3. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/pypi.yaml +3 -3
  4. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/PKG-INFO +15 -14
  5. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +26 -17
  6. sdg_hub-0.7.2/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/imgs/quality_benchmark_accuracy.png +0 -0
  7. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/knowledge_utils.py +12 -6
  8. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/pyproject.toml +23 -16
  9. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/_version.py +3 -3
  10. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +9 -5
  11. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/base.py +6 -1
  12. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/PKG-INFO +15 -14
  13. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/SOURCES.txt +1 -0
  14. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/requires.txt +3 -1
  15. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_base.py +50 -4
  16. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tox.ini +6 -4
  17. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/actionlint.yaml +0 -0
  18. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/actions/free-disk-space/action.yml +0 -0
  19. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/dependabot.yml +0 -0
  20. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/mergify.yml +0 -0
  21. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/actionlint.dockerfile +0 -0
  22. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/actionlint.yml +0 -0
  23. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/lint.yml +0 -0
  24. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/matchers/actionlint.json +0 -0
  25. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/matchers/pylint.json +0 -0
  26. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/packer.yml +0 -0
  27. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/test.yml +0 -0
  28. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.gitignore +0 -0
  29. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.isort.cfg +0 -0
  30. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.markdownlint-cli2.yaml +0 -0
  31. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.pre-commit-config.yaml +0 -0
  32. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.pylintrc +0 -0
  33. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/CLAUDE.md +0 -0
  34. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/CONTRIBUTING.md +0 -0
  35. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/LICENSE +0 -0
  36. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/Makefile +0 -0
  37. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/README.md +0 -0
  38. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/.nojekyll +0 -0
  39. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/README.md +0 -0
  40. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/_coverpage.md +0 -0
  41. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/_navbar.md +0 -0
  42. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/_sidebar.md +0 -0
  43. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/api-reference.md +0 -0
  44. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/assets/logo.png +0 -0
  45. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/assets/sdg-hub-cover.png +0 -0
  46. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/custom-blocks.md +0 -0
  47. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/filtering-blocks.md +0 -0
  48. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/llm-blocks.md +0 -0
  49. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/overview.md +0 -0
  50. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/transform-blocks.md +0 -0
  51. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/concepts.md +0 -0
  52. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/development.md +0 -0
  53. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/flows/available-flows.md +0 -0
  54. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/flows/custom-flows.md +0 -0
  55. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/flows/discovery.md +0 -0
  56. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/flows/overview.md +0 -0
  57. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/index.html +0 -0
  58. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/installation.md +0 -0
  59. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/quick-start.md +0 -0
  60. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
  61. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +0 -0
  62. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +0 -0
  63. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +0 -0
  64. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
  65. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +0 -0
  66. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
  67. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/README.md +0 -0
  68. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
  69. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
  70. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  71. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
  72. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  73. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  74. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  75. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  76. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
  77. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
  78. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/knowledge_generation_ja.ipynb +0 -0
  79. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
  80. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/rag_evaluation/ibm-annual-report-2024.pdf +0 -0
  81. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/rag_evaluation/rag_evaluation_dataset_generation.ipynb +0 -0
  82. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/text_analysis/README.md +0 -0
  83. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
  84. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
  85. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/scripts/packer/centos.pkr.hcl +0 -0
  86. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/scripts/packer/setup-centos.sh +0 -0
  87. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/scripts/ruff.sh +0 -0
  88. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/scripts/snyk_notebook_scan.sh +0 -0
  89. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/setup.cfg +0 -0
  90. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/__init__.py +0 -0
  91. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/__init__.py +0 -0
  92. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/__init__.py +0 -0
  93. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/base.py +0 -0
  94. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
  95. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
  96. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
  97. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
  98. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +0 -0
  99. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
  100. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/text_parser_block.py +0 -0
  101. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/registry.py +0 -0
  102. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
  103. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
  104. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
  105. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
  106. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
  107. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/rename_columns.py +0 -0
  108. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
  109. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
  110. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/__init__.py +0 -0
  111. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/checkpointer.py +0 -0
  112. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/metadata.py +0 -0
  113. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/registry.py +0 -0
  114. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/validation.py +0 -0
  115. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/__init__.py +0 -0
  116. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/datautils.py +0 -0
  117. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/error_handling.py +0 -0
  118. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
  119. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
  120. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/flow_metrics.py +0 -0
  121. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/logger_config.py +0 -0
  122. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/path_resolution.py +0 -0
  123. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/time_estimator.py +0 -0
  124. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
  125. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/__init__.py +0 -0
  126. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/answer_generation.yaml +0 -0
  127. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/conceptual_qa_generation.yaml +0 -0
  128. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/context_extraction.yaml +0 -0
  129. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/flow.yaml +0 -0
  130. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/groundedness_critic.yaml +0 -0
  131. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/question_evolution.yaml +0 -0
  132. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/topic_generation.yaml +0 -0
  133. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
  134. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
  135. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
  136. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +0 -0
  137. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
  138. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -0
  139. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  140. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
  141. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +0 -0
  142. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
  143. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
  144. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
  145. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  146. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +0 -0
  147. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
  148. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
  149. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
  150. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
  151. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
  152. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
  153. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
  154. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
  155. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
  156. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -0
  157. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
  158. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
  159. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
  160. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
  161. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
  162. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
  163. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +0 -0
  164. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
  165. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
  166. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
  167. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
  168. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
  169. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
  170. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -0
  171. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
  172. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/py.typed +0 -0
  173. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  174. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/top_level.txt +0 -0
  175. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/__init__.py +0 -0
  176. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
  177. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/llm/test_llm_chat_block.py +0 -0
  178. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/llm/test_llm_parser_block.py +0 -0
  179. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
  180. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/llm/test_textparserblock.py +0 -0
  181. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/test_base_block.py +0 -0
  182. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/test_registry.py +0 -0
  183. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_config.yaml +0 -0
  184. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
  185. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
  186. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
  187. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
  188. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
  189. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_index_based_mapper.py +0 -0
  190. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_json_structure_block.py +0 -0
  191. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_melt_columns.py +0 -0
  192. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_rename_columns.py +0 -0
  193. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_text_concat.py +0 -0
  194. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_uniform_col_val_setter.py +0 -0
  195. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/__init__.py +0 -0
  196. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/conftest.py +0 -0
  197. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_checkpointer.py +0 -0
  198. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_dataset_requirements.py +0 -0
  199. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_integration.py +0 -0
  200. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_metadata.py +0 -0
  201. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_registry.py +0 -0
  202. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_time_estimation.py +0 -0
  203. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_validation.py +0 -0
  204. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/README.md +0 -0
  205. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/__init__.py +0 -0
  206. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  207. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
  208. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
  209. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +0 -0
  210. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +0 -0
  211. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/utils/test_datautils.py +0 -0
  212. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/utils/test_error_handling.py +0 -0
  213. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/utils/test_flow_metrics.py +0 -0
  214. {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/utils/test_path_resolution.py +0 -0
@@ -39,6 +39,6 @@ jobs:
39
39
  - name: "Checkout"
40
40
  uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
41
41
  - name: "Check Markdown documents"
42
- uses: DavidAnson/markdownlint-cli2-action@30a0e04f1870d58f8d717450cc6134995f993c63 # v21.0.0
42
+ uses: DavidAnson/markdownlint-cli2-action@07035fd053f7be764496c0f8d8f9f41f98305101 # v22.0.0
43
43
  with:
44
44
  globs: '**/*.md'
@@ -112,7 +112,7 @@ jobs:
112
112
 
113
113
 
114
114
  - name: Cache huggingface datasets
115
- uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
115
+ uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
116
116
  with:
117
117
  path: ~/.cache/huggingface
118
118
  # Invalidate cache when any example notebook changes (may affect dataset downloads)
@@ -127,6 +127,7 @@ jobs:
127
127
  env:
128
128
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
129
129
  run: |
130
+ # Uses .[dev,integration] - lightweight, no torch/transformers
130
131
  tox -e py3-integrationcov
131
132
 
132
133
 
@@ -139,7 +140,7 @@ jobs:
139
140
  flags: integration
140
141
 
141
142
  - name: Upload integration test artifacts
142
- uses: actions/upload-artifact@v5
143
+ uses: actions/upload-artifact@v6
143
144
  if: always()
144
145
  with:
145
146
  name: integration-test-results-${{ matrix.python }}-${{ matrix.platform }}
@@ -72,7 +72,7 @@ jobs:
72
72
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
73
73
 
74
74
  - name: "Download build artifacts"
75
- uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
75
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
76
76
  with:
77
77
  name: Packages
78
78
  path: dist
@@ -104,13 +104,13 @@ jobs:
104
104
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
105
105
 
106
106
  - name: "Download build artifacts"
107
- uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
107
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
108
108
  with:
109
109
  name: Packages
110
110
  path: dist
111
111
 
112
112
  - name: "Sigstore sign package"
113
- uses: sigstore/gh-action-sigstore-python@f832326173235dcb00dd5d92cd3f353de3188e6c # v3.1.0
113
+ uses: sigstore/gh-action-sigstore-python@a5caf349bc536fbef3668a10ed7f5cd309a4b53d # v3.2.0
114
114
  with:
115
115
  inputs: |
116
116
  ./dist/*.tar.gz
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.7.0
3
+ Version: 0.7.2
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -33,6 +33,20 @@ Requires-Dist: pydantic<3.0.0,>=2.0.0
33
33
  Requires-Dist: python-dotenv<2.0.0,>=1.0.0
34
34
  Requires-Dist: tenacity!=8.4.0,>=8.3.0
35
35
  Requires-Dist: tqdm<5.0.0,>=4.66.2
36
+ Provides-Extra: dev
37
+ Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
38
+ Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
39
+ Requires-Dist: pylint-pydantic; extra == "dev"
40
+ Requires-Dist: pytest; extra == "dev"
41
+ Requires-Dist: pytest-asyncio; extra == "dev"
42
+ Requires-Dist: pytest-cov; extra == "dev"
43
+ Requires-Dist: pytest-html; extra == "dev"
44
+ Requires-Dist: tox<5,>=4.4.2; extra == "dev"
45
+ Requires-Dist: ruff; extra == "dev"
46
+ Requires-Dist: pytest-env; extra == "dev"
47
+ Requires-Dist: nbconvert>=7.0.0; extra == "dev"
48
+ Provides-Extra: integration
49
+ Requires-Dist: nest-asyncio; extra == "integration"
36
50
  Provides-Extra: examples
37
51
  Requires-Dist: tabulate>=0.9.0; extra == "examples"
38
52
  Requires-Dist: transformers>=4.37.0; extra == "examples"
@@ -46,20 +60,7 @@ Requires-Dist: nltk; extra == "examples"
46
60
  Requires-Dist: sentence-transformers; extra == "examples"
47
61
  Requires-Dist: instructor; extra == "examples"
48
62
  Requires-Dist: fastapi; extra == "examples"
49
- Requires-Dist: nest-asyncio; extra == "examples"
50
63
  Requires-Dist: ipykernel; extra == "examples"
51
- Provides-Extra: dev
52
- Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
53
- Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
54
- Requires-Dist: pylint-pydantic; extra == "dev"
55
- Requires-Dist: pytest; extra == "dev"
56
- Requires-Dist: pytest-asyncio; extra == "dev"
57
- Requires-Dist: pytest-cov; extra == "dev"
58
- Requires-Dist: pytest-html; extra == "dev"
59
- Requires-Dist: tox<5,>=4.4.2; extra == "dev"
60
- Requires-Dist: ruff; extra == "dev"
61
- Requires-Dist: pytest-env; extra == "dev"
62
- Requires-Dist: nbconvert>=7.0.0; extra == "dev"
63
64
  Dynamic: license-file
64
65
 
65
66
  # `sdg_hub`: Synthetic Data Generation Toolkit
@@ -48,29 +48,38 @@ Only claims passing this check are retained. This process filters out **hallucin
48
48
 
49
49
  ---
50
50
 
51
- ## Data Generation Statistics
51
+ ## Data Generation Statistics and Results
52
+
53
+ **Teacher model for generation:** `openai/gpt-oss-120b`
54
+ **Student model trained:** `meta-llama/Llama-3.1-8B-Instruct`
55
+ **Training method:** Supervised Fine-Tuning (SFT)
56
+
57
+ ---
52
58
 
53
59
  ### Summary Augmentation
54
60
 
55
- Each “cut” represents the total number of summaries generated per document across all three augmentation types.
61
+ For each document, we generate three augmentation types—detailed summaries, extractive summaries, and atomic facts. Each “cut” in the table below represents the total number of summary augmentations per document (i.e., how many times each augmentation process is run).
56
62
 
57
- | Cut (NUMBER\_OF\_SUMMARIES = 3) | Token Count |
58
- | ------------------------------- | ----------- |
59
- | 1 | 2,193,502 |
60
- | 2 | 4,383,655 |
61
- | 5 | 10,870,396 |
62
- | 10 | 21,815,170 |
63
- | 20 | 43,601,976 |
64
- | 30 | 65,395,710 |
65
- | 40 | 87,118,308 |
66
- | 50 | 108,779,213 |
63
+ | Cut (NUMBER\_OF\_SUMMARIES = 3) | Token Count |
64
+ | ------------------------------- | ------------- |
65
+ | Input Corpus | 1,517,465 |
66
+ | 10 | 87,248,889 |
67
+ | 20 | 158,615,276 |
68
+ | 30 | 230,306,195 |
69
+ | 40 | 301,805,906 |
70
+ | 50 | 373,183,414 |
67
71
 
68
72
  ---
69
73
 
70
- ### Finance Bench Example
74
+ ### Benchmark Results
71
75
 
72
- For Finance Bench (NUMBER\_OF\_SUMMARIES = 1):
76
+ - **Evaluation benchmark:** [QuALITY benchmark](https://nyu-mll.github.io/quality/)
77
+ - **Evaluation script & metric:** [Synthetic_Continued_Pretraining](https://github.com/ZitongYang/Synthetic_Continued_Pretraining/blob/main/evaluation.py), Exact Match (EM)
78
+ - **Student model:** meta-llama/Llama-3.1-8B-Instruct (after SFT on generated/augmented summaries)
79
+ - **Performance metric:** Model accuracy
73
80
 
74
- | Cut | Token Count |
75
- | --- | ----------- |
76
- | 50 | 213,333,192 |
81
+ ![Quality Benchmark Accuracy](imgs/quality_benchmark_accuracy.png)
82
+
83
+ *Figure: Model accuracy across the QuALITY benchmark datasets, comparing SFT training on enhanced document summaries with the original model performance.*
84
+
85
+ ---
@@ -602,13 +602,14 @@ def _num_chars_from_tokens(num_tokens) -> int:
602
602
  return int(num_tokens * 4) # 1 token ~ 4 English characters
603
603
 
604
604
 
605
- def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
605
+ def chunk_document(documents: List, server_ctx_size, chunk_word_count, **kwargs) -> List[str]:
606
606
  """
607
607
  Iterates over the documents and splits them into chunks based on the word count provided by the user.
608
608
  Args:
609
609
  documents (list): List of documents retrieved from git (can also consist of a single document).
610
610
  server_ctx_size (int): Context window size of server.
611
611
  chunk_word_count (int): Maximum number of words to chunk a document.
612
+ chunk_overlap (int): Overlap in characters between chunks.
612
613
  Returns:
613
614
  List[str]: List of chunked documents.
614
615
  """
@@ -634,7 +635,7 @@ def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[s
634
635
  # Placeholder for params
635
636
  content = []
636
637
  chunk_size = _num_chars_from_tokens(no_tokens_per_doc)
637
- chunk_overlap = _DEFAULT_CHUNK_OVERLAP
638
+ chunk_overlap = int(kwargs.pop("chunk_overlap", str(_DEFAULT_CHUNK_OVERLAP)))
638
639
 
639
640
  # Using Markdown as default, document-specific chunking will be implemented in a separate PR.
640
641
  text_splitter = RecursiveCharacterTextSplitter.from_language(
@@ -729,16 +730,21 @@ class DocProcessor:
729
730
  }
730
731
  )
731
732
 
732
- def _add_icls(self, chunked_document: Dataset) -> Dataset:
733
+ def _add_icls(self, chunked_document: Dataset, **kwargs) -> Dataset:
733
734
  """
734
735
  Add the ICLS label to the dataset.
735
736
  Args:
736
737
  chunked_document (Dataset): Dataset object.
738
+ server_ctx_size (int): Context window size of server.
739
+ chunk_word_count (int): Maximum number of words to chunk a document.
740
+ chunk_overlap (int): Overlap in characters between chunks.
737
741
 
738
742
  Returns
739
743
  -------
740
744
  Dataset: Dataset object with ICLS label.
741
745
  """
746
+ server_ctx_size = int(kwargs.pop("server_ctx_size", "4096"))
747
+ chunk_word_count = int(kwargs.pop("chunk_word_count", "1024"))
742
748
  icl = self.user_config["seed_examples"]
743
749
  chunked_document_all_icl = []
744
750
  for icl_ in icl:
@@ -762,7 +768,7 @@ class DocProcessor:
762
768
  chunked_document_all_icl = chunked_document_all_icl.map(
763
769
  lambda x: {
764
770
  "chunks": chunk_document(
765
- [x["document"]], server_ctx_size=4096, chunk_word_count=1024
771
+ [x["document"]], server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, **kwargs
766
772
  )
767
773
  if get_token_count(x["document"], self.tokenizer) > 1024
768
774
  else [x["document"]]
@@ -797,7 +803,7 @@ class DocProcessor:
797
803
  df = safe_concatenate_datasets([ds.to_pandas() for ds in datasets])
798
804
  return Dataset.from_pandas(df) if df is not None else None
799
805
 
800
- def get_processed_markdown_dataset(self, list_md_files: list[Path]) -> Dataset:
806
+ def get_processed_markdown_dataset(self, list_md_files: list[Path], **kwargs) -> Dataset:
801
807
  chunks_mds = []
802
808
  for md_file in list_md_files:
803
809
  with open(md_file, "r", encoding="utf-8") as f:
@@ -811,5 +817,5 @@ class DocProcessor:
811
817
  }
812
818
  )
813
819
  chunk_ds = Dataset.from_list(chunks_mds)
814
- chunk_ds_with_icls = self._add_icls(chunk_ds)
820
+ chunk_ds_with_icls = self._add_icls(chunk_ds, **kwargs)
815
821
  return chunk_ds_with_icls
@@ -51,22 +51,7 @@ source = "https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub"
51
51
  issues = "https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues"
52
52
 
53
53
  [project.optional-dependencies]
54
- examples = [
55
- "tabulate>=0.9.0",
56
- "transformers>=4.37.0",
57
- "langchain-text-splitters",
58
- "docling>=2.3.0",
59
- "scikit-learn",
60
- "polars",
61
- "matplotlib",
62
- "spacy",
63
- "nltk",
64
- "sentence-transformers",
65
- "instructor",
66
- "fastapi",
67
- "nest-asyncio",
68
- "ipykernel",
69
- ]
54
+ # Development and testing dependencies (lightweight, no ML libraries)
70
55
  dev = [
71
56
  "pre-commit>=3.0.4,<4.0",
72
57
  "pylint>=2.16.2,<4.0",
@@ -81,6 +66,28 @@ dev = [
81
66
  # Integration testing dependencies
82
67
  "nbconvert>=7.0.0",
83
68
  ]
69
+ # Minimal dependencies for integration testing only
70
+ # Integration tests run knowledge_generation.ipynb which only needs nest-asyncio
71
+ integration = [
72
+ "nest-asyncio",
73
+ ]
74
+ # Heavy dependencies for example notebooks (knowledge_mixing.ipynb, etc.)
75
+ # NOT required for core functionality testing or integration tests
76
+ examples = [
77
+ "tabulate>=0.9.0",
78
+ "transformers>=4.37.0", # For knowledge_mixing.ipynb, NOT integration tests
79
+ "langchain-text-splitters",
80
+ "docling>=2.3.0", # For document parsing examples
81
+ "scikit-learn", # For raft_builder.py utility
82
+ "polars", # For knowledge_mixing_utils.py
83
+ "matplotlib",
84
+ "spacy",
85
+ "nltk",
86
+ "sentence-transformers",
87
+ "instructor",
88
+ "fastapi",
89
+ "ipykernel",
90
+ ]
84
91
 
85
92
  [tool.setuptools_scm]
86
93
  version_file = "src/sdg_hub/_version.py"
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.7.0'
32
- __version_tuple__ = version_tuple = (0, 7, 0)
31
+ __version__ = version = '0.7.2'
32
+ __version_tuple__ = version_tuple = (0, 7, 2)
33
33
 
34
- __commit_id__ = commit_id = 'g33f3e7e56'
34
+ __commit_id__ = commit_id = 'g99a40a268'
@@ -6,7 +6,8 @@ from typing import Any, Optional
6
6
  import asyncio
7
7
 
8
8
  from litellm import acompletion, completion
9
- from pydantic import ConfigDict, Field, field_validator
9
+ from pydantic import ConfigDict, Field, SecretStr, field_validator
10
+ from tqdm.asyncio import tqdm_asyncio
10
11
  import litellm
11
12
 
12
13
  # Third Party
@@ -52,8 +53,9 @@ class LLMChatBlock(BaseBlock):
52
53
  model : Optional[str], optional
53
54
  Model identifier in LiteLLM format. Can be set later via flow.set_model_config().
54
55
  Examples: "openai/gpt-4", "anthropic/claude-3-sonnet-20240229"
55
- api_key : Optional[str], optional
56
+ api_key : Optional[SecretStr], optional
56
57
  API key for the provider. Falls back to environment variables.
58
+ Automatically redacted in logs and string representations.
57
59
  api_base : Optional[str], optional
58
60
  Base URL for the API. Required for local models.
59
61
  async_mode : bool, optional
@@ -97,7 +99,7 @@ class LLMChatBlock(BaseBlock):
97
99
  model: Optional[str] = Field(
98
100
  None, exclude=True, description="Model identifier in LiteLLM format"
99
101
  )
100
- api_key: Optional[str] = Field(
102
+ api_key: Optional[SecretStr] = Field(
101
103
  None, exclude=True, description="API key for the provider"
102
104
  )
103
105
  api_base: Optional[str] = Field(
@@ -301,7 +303,7 @@ class LLMChatBlock(BaseBlock):
301
303
  if self.model is not None:
302
304
  completion_kwargs["model"] = self.model
303
305
  if self.api_key is not None:
304
- completion_kwargs["api_key"] = self.api_key
306
+ completion_kwargs["api_key"] = self.api_key.get_secret_value()
305
307
  if self.api_base is not None:
306
308
  completion_kwargs["api_base"] = self.api_base
307
309
  if self.timeout is not None:
@@ -501,7 +503,9 @@ class LLMChatBlock(BaseBlock):
501
503
  for messages in messages_list
502
504
  ]
503
505
 
504
- responses = await asyncio.gather(*tasks)
506
+ responses = await tqdm_asyncio.gather(
507
+ *tasks, desc=self.block_name, unit="req"
508
+ )
505
509
  return responses
506
510
 
507
511
  except Exception as e:
@@ -13,6 +13,7 @@ from pydantic import (
13
13
  ConfigDict,
14
14
  Field,
15
15
  PrivateAttr,
16
+ SecretStr,
16
17
  field_validator,
17
18
  model_validator,
18
19
  )
@@ -793,7 +794,10 @@ class Flow(BaseModel):
793
794
  if api_base is not None:
794
795
  config_params["api_base"] = api_base
795
796
  if api_key is not None:
796
- config_params["api_key"] = api_key
797
+ # Convert string api_key to SecretStr for automatic redaction in logs
798
+ config_params["api_key"] = (
799
+ SecretStr(api_key) if isinstance(api_key, str) else api_key
800
+ )
797
801
 
798
802
  # Add any additional kwargs (temperature, max_tokens, etc.)
799
803
  config_params.update(kwargs)
@@ -855,6 +859,7 @@ class Flow(BaseModel):
855
859
 
856
860
  if modified_count > 0:
857
861
  # Enhanced logging showing what was configured
862
+ # Note: SecretStr values automatically display as '**********' in logs
858
863
  param_summary = []
859
864
  for param_name, param_value in config_params.items():
860
865
  if param_name == "model":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.7.0
3
+ Version: 0.7.2
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -33,6 +33,20 @@ Requires-Dist: pydantic<3.0.0,>=2.0.0
33
33
  Requires-Dist: python-dotenv<2.0.0,>=1.0.0
34
34
  Requires-Dist: tenacity!=8.4.0,>=8.3.0
35
35
  Requires-Dist: tqdm<5.0.0,>=4.66.2
36
+ Provides-Extra: dev
37
+ Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
38
+ Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
39
+ Requires-Dist: pylint-pydantic; extra == "dev"
40
+ Requires-Dist: pytest; extra == "dev"
41
+ Requires-Dist: pytest-asyncio; extra == "dev"
42
+ Requires-Dist: pytest-cov; extra == "dev"
43
+ Requires-Dist: pytest-html; extra == "dev"
44
+ Requires-Dist: tox<5,>=4.4.2; extra == "dev"
45
+ Requires-Dist: ruff; extra == "dev"
46
+ Requires-Dist: pytest-env; extra == "dev"
47
+ Requires-Dist: nbconvert>=7.0.0; extra == "dev"
48
+ Provides-Extra: integration
49
+ Requires-Dist: nest-asyncio; extra == "integration"
36
50
  Provides-Extra: examples
37
51
  Requires-Dist: tabulate>=0.9.0; extra == "examples"
38
52
  Requires-Dist: transformers>=4.37.0; extra == "examples"
@@ -46,20 +60,7 @@ Requires-Dist: nltk; extra == "examples"
46
60
  Requires-Dist: sentence-transformers; extra == "examples"
47
61
  Requires-Dist: instructor; extra == "examples"
48
62
  Requires-Dist: fastapi; extra == "examples"
49
- Requires-Dist: nest-asyncio; extra == "examples"
50
63
  Requires-Dist: ipykernel; extra == "examples"
51
- Provides-Extra: dev
52
- Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
53
- Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
54
- Requires-Dist: pylint-pydantic; extra == "dev"
55
- Requires-Dist: pytest; extra == "dev"
56
- Requires-Dist: pytest-asyncio; extra == "dev"
57
- Requires-Dist: pytest-cov; extra == "dev"
58
- Requires-Dist: pytest-html; extra == "dev"
59
- Requires-Dist: tox<5,>=4.4.2; extra == "dev"
60
- Requires-Dist: ruff; extra == "dev"
61
- Requires-Dist: pytest-env; extra == "dev"
62
- Requires-Dist: nbconvert>=7.0.0; extra == "dev"
63
64
  Dynamic: license-file
64
65
 
65
66
  # `sdg_hub`: Synthetic Data Generation Toolkit
@@ -54,6 +54,7 @@ examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation
54
54
  examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb
55
55
  examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py
56
56
  examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py
57
+ examples/knowledge_tuning/enhanced_summary_knowledge_tuning/imgs/quality_benchmark_accuracy.png
57
58
  examples/knowledge_tuning/instructlab/.gitignore
58
59
  examples/knowledge_tuning/instructlab/README.md
59
60
  examples/knowledge_tuning/instructlab/docling_v2_config.yaml
@@ -36,5 +36,7 @@ nltk
36
36
  sentence-transformers
37
37
  instructor
38
38
  fastapi
39
- nest-asyncio
40
39
  ipykernel
40
+
41
+ [integration]
42
+ nest-asyncio
@@ -527,13 +527,15 @@ class TestFlow:
527
527
  ):
528
528
  """Create a mock LLM block with model attributes."""
529
529
  # First Party
530
+ from pydantic import SecretStr
530
531
  from tests.flow.conftest import MockBlock
531
532
 
532
533
  block = MockBlock(block_name=name, input_cols=["input"], output_cols=["output"])
533
534
  # Add LLM-related attributes
534
535
  block.model = model
535
536
  block.api_base = api_base
536
- block.api_key = api_key
537
+ # Convert api_key to SecretStr to match real LLM blocks
538
+ block.api_key = SecretStr(api_key) if isinstance(api_key, str) else api_key
537
539
  block.temperature = 0.0
538
540
  block.max_tokens = 1024
539
541
  return block
@@ -637,7 +639,7 @@ class TestFlow:
637
639
  # Check that LLM blocks were modified
638
640
  assert flow.blocks[1].model == "new-model" # llm_block1
639
641
  assert flow.blocks[1].api_base == "http://localhost:8101/v1"
640
- assert flow.blocks[1].api_key == "NEW_KEY"
642
+ assert flow.blocks[1].api_key.get_secret_value() == "NEW_KEY"
641
643
  assert flow.blocks[1].temperature == 0.7
642
644
  assert flow.blocks[1].max_tokens == 2048
643
645
 
@@ -696,7 +698,7 @@ class TestFlow:
696
698
 
697
699
  # Other parameters should remain unchanged
698
700
  assert flow.blocks[0].api_base == "http://localhost:8000/v1"
699
- assert flow.blocks[0].api_key == "OLD_KEY"
701
+ assert flow.blocks[0].api_key.get_secret_value() == "OLD_KEY"
700
702
  assert flow.blocks[0].max_tokens == 1024
701
703
 
702
704
  def test_set_model_config_with_kwargs(self):
@@ -793,7 +795,7 @@ class TestFlow:
793
795
 
794
796
  # Everything else should remain the same
795
797
  assert flow.blocks[0].api_base == "http://localhost:8000/v1"
796
- assert flow.blocks[0].api_key == "ORIGINAL_KEY"
798
+ assert flow.blocks[0].api_key.get_secret_value() == "ORIGINAL_KEY"
797
799
  assert flow.blocks[0].temperature == 0.5
798
800
  assert flow.blocks[0].max_tokens == 1024
799
801
  assert flow.blocks[0].custom_param == "custom_value"
@@ -1422,3 +1424,47 @@ class TestFlow:
1422
1424
  FlowValidationError, match="max_concurrency must be greater than 0"
1423
1425
  ):
1424
1426
  flow.generate(dataset, max_concurrency=-1)
1427
+
1428
+ def test_set_model_config_redacts_sensitive_params(self, caplog):
1429
+ """Test API key and secrets redaction in logs using Pydantic SecretStr.
1430
+
1431
+ Verifies that sensitive parameters (api_key) are automatically redacted
1432
+ by SecretStr while non-sensitive ones remain visible.
1433
+ """
1434
+ # Standard
1435
+ import logging
1436
+
1437
+ # First Party
1438
+ from sdg_hub.core.blocks.llm.llm_chat_block import LLMChatBlock
1439
+
1440
+ llm_block = LLMChatBlock(
1441
+ block_name="test_llm", input_cols="messages", output_cols="response"
1442
+ )
1443
+ flow = Flow(metadata=self.test_metadata, blocks=[llm_block])
1444
+
1445
+ with caplog.at_level(logging.INFO, logger="sdg_hub.core.flow.base"):
1446
+ flow.set_model_config(
1447
+ model="openai/gpt-4",
1448
+ api_key="sk-secret-key",
1449
+ temperature=0.7,
1450
+ max_tokens=100,
1451
+ )
1452
+
1453
+ log_messages = [record.message for record in caplog.records]
1454
+ relevant_logs = [
1455
+ msg for msg in log_messages if "Successfully configured" in msg
1456
+ ]
1457
+ assert len(relevant_logs) > 0
1458
+ log_text = relevant_logs[0]
1459
+
1460
+ # Sensitive params must be redacted - SecretStr displays as '**********'
1461
+ assert "**********" in log_text or "SecretStr" in log_text
1462
+ assert "sk-secret-key" not in log_text
1463
+
1464
+ # Non-sensitive params should be visible
1465
+ assert "temperature: 0.7" in log_text
1466
+ assert "max_tokens: 100" in log_text
1467
+
1468
+ # Verify that the api_key was actually set as a SecretStr on the block
1469
+ assert llm_block.api_key is not None
1470
+ assert llm_block.api_key.get_secret_value() == "sk-secret-key"
@@ -22,28 +22,30 @@ commands =
22
22
  integration: {envpython} -m pytest {posargs:tests/integration}
23
23
 
24
24
  # Integration test environment - runs notebook-based integration tests
25
+ # Lightweight: only requires nest-asyncio, NO torch/transformers needed
25
26
  [testenv:py3-integration]
26
- description = run integration tests (notebooks)
27
+ description = run integration tests (notebooks) - lightweight, no torch/transformers needed
27
28
  package = wheel
28
29
  wheel_build_env = pkg
29
30
  passenv =
30
31
  OPENAI_API_KEY
31
32
  deps =
32
33
  .[dev]
33
- .[examples]
34
+ .[integration]
34
35
  commands =
35
36
  {envpython} -m pytest {posargs:tests/integration -v -s}
36
37
 
37
38
  # Integration test environment with coverage - runs notebook-based integration tests with coverage collection
39
+ # Lightweight: only requires nest-asyncio, NO torch/transformers needed
38
40
  [testenv:py3-integrationcov]
39
- description = run integration tests (notebooks) with coverage
41
+ description = run integration tests (notebooks) with coverage - lightweight, no torch/transformers needed
40
42
  package = wheel
41
43
  wheel_build_env = pkg
42
44
  passenv =
43
45
  OPENAI_API_KEY
44
46
  deps =
45
47
  .[dev]
46
- .[examples]
48
+ .[integration]
47
49
  commands =
48
50
  {envpython} -m pytest --cov=sdg_hub --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html tests/integration {posargs:-v -s}
49
51
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes