sdg-hub 0.2.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213) hide show
  1. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/workflows/pypi.yaml +2 -2
  2. {sdg_hub-0.2.2/src/sdg_hub.egg-info → sdg_hub-0.3.0}/PKG-INFO +3 -1
  3. sdg_hub-0.3.0/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +59 -0
  4. sdg_hub-0.3.0/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +76 -0
  5. sdg_hub-0.3.0/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +588 -0
  6. sdg_hub-0.3.0/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +578 -0
  7. sdg_hub-0.3.0/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +289 -0
  8. sdg_hub-0.3.0/examples/text_analysis/README.md +145 -0
  9. sdg_hub-0.3.0/examples/text_analysis/extract_stock_tickers.yaml +25 -0
  10. sdg_hub-0.3.0/examples/text_analysis/structured_insights_demo.ipynb +4479 -0
  11. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/pyproject.toml +5 -0
  12. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/_version.py +3 -3
  13. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/llm/client_manager.py +37 -25
  14. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +12 -9
  15. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/llm/text_parser_block.py +88 -21
  16. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/transform/__init__.py +2 -0
  17. sdg_hub-0.3.0/src/sdg_hub/core/blocks/transform/json_structure_block.py +142 -0
  18. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/flow/base.py +199 -56
  19. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/utils/datautils.py +27 -2
  20. sdg_hub-0.3.0/src/sdg_hub/core/utils/flow_metrics.py +261 -0
  21. sdg_hub-0.3.0/src/sdg_hub/core/utils/logger_config.py +61 -0
  22. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +11 -0
  23. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +159 -0
  24. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +65 -0
  25. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +161 -0
  26. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +15 -0
  27. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +21 -0
  28. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +44 -0
  29. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +104 -0
  30. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +61 -0
  31. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
  32. sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
  33. sdg_hub-0.3.0/src/sdg_hub/flows/text_analysis/__init__.py +2 -0
  34. sdg_hub-0.3.0/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +6 -0
  35. sdg_hub-0.3.0/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +27 -0
  36. sdg_hub-0.3.0/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +38 -0
  37. sdg_hub-0.3.0/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +21 -0
  38. sdg_hub-0.3.0/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +153 -0
  39. sdg_hub-0.3.0/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +21 -0
  40. sdg_hub-0.3.0/src/sdg_hub/py.typed +0 -0
  41. {sdg_hub-0.2.2 → sdg_hub-0.3.0/src/sdg_hub.egg-info}/PKG-INFO +3 -1
  42. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub.egg-info/SOURCES.txt +31 -0
  43. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub.egg-info/requires.txt +2 -0
  44. sdg_hub-0.3.0/tests/__init__.py +0 -0
  45. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/llm/test_llm_chat_block.py +66 -25
  46. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +111 -105
  47. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/llm/test_textparserblock.py +645 -66
  48. sdg_hub-0.3.0/tests/blocks/transform/test_json_structure_block.py +303 -0
  49. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/test_base.py +95 -0
  50. sdg_hub-0.3.0/tests/utils/test_datautils.py +132 -0
  51. sdg_hub-0.2.2/src/sdg_hub/core/utils/logger_config.py +0 -20
  52. sdg_hub-0.2.2/tests/utils/test_datautils.py +0 -43
  53. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/actionlint.yaml +0 -0
  54. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/actions/free-disk-space/action.yml +0 -0
  55. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/dependabot.yml +0 -0
  56. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/mergify.yml +0 -0
  57. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/workflows/actionlint.dockerfile +0 -0
  58. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/workflows/actionlint.yml +0 -0
  59. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/workflows/docs.yml +0 -0
  60. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/workflows/e2e.yml +0 -0
  61. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/workflows/lint.yml +0 -0
  62. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/workflows/matchers/actionlint.json +0 -0
  63. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/workflows/matchers/pylint.json +0 -0
  64. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.github/workflows/test.yml +0 -0
  65. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.gitignore +0 -0
  66. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.isort.cfg +0 -0
  67. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.markdownlint-cli2.yaml +0 -0
  68. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.pre-commit-config.yaml +0 -0
  69. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/.pylintrc +0 -0
  70. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/CLAUDE.md +0 -0
  71. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/CONTRIBUTING.md +0 -0
  72. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/LICENSE +0 -0
  73. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/Makefile +0 -0
  74. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/README.md +0 -0
  75. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/.nojekyll +0 -0
  76. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/README.md +0 -0
  77. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/_coverpage.md +0 -0
  78. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/_navbar.md +0 -0
  79. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/_sidebar.md +0 -0
  80. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/api-reference.md +0 -0
  81. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/blocks/custom-blocks.md +0 -0
  82. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/blocks/evaluation-blocks.md +0 -0
  83. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/blocks/filtering-blocks.md +0 -0
  84. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/blocks/llm-blocks.md +0 -0
  85. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/blocks/overview.md +0 -0
  86. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/blocks/transform-blocks.md +0 -0
  87. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/concepts.md +0 -0
  88. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/development.md +0 -0
  89. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/flows/discovery.md +0 -0
  90. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/flows/overview.md +0 -0
  91. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/index.html +0 -0
  92. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/installation.md +0 -0
  93. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/docs/quick-start.md +0 -0
  94. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/annotation/annotation_classification.ipynb +0 -0
  95. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
  96. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/annotation/news_classification_flow.yaml +0 -0
  97. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/annotation/news_classification_prompt.yaml +0 -0
  98. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
  99. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
  100. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/README.md +0 -0
  101. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
  102. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
  103. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  104. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
  105. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  106. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  107. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  108. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  109. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
  110. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
  111. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
  112. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/examples/knowledge_tuning/knowledge_utils.py +0 -0
  113. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/scripts/ruff.sh +0 -0
  114. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/setup.cfg +0 -0
  115. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/__init__.py +0 -0
  116. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/__init__.py +0 -0
  117. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/__init__.py +0 -0
  118. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/base.py +0 -0
  119. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -0
  120. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -0
  121. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -0
  122. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -0
  123. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -0
  124. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -0
  125. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -0
  126. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -0
  127. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -0
  128. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -0
  129. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/evaluation/__init__.py +0 -0
  130. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +0 -0
  131. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +0 -0
  132. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/evaluation/verify_question_block.py +0 -0
  133. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
  134. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
  135. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
  136. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/llm/config.py +0 -0
  137. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
  138. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -0
  139. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
  140. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/registry.py +0 -0
  141. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
  142. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
  143. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
  144. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/transform/rename_columns.py +0 -0
  145. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
  146. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
  147. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/flow/__init__.py +0 -0
  148. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/flow/checkpointer.py +0 -0
  149. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/flow/metadata.py +0 -0
  150. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/flow/migration.py +0 -0
  151. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/flow/registry.py +0 -0
  152. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/flow/validation.py +0 -0
  153. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/utils/__init__.py +0 -0
  154. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/utils/error_handling.py +0 -0
  155. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
  156. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
  157. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/utils/path_resolution.py +0 -0
  158. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
  159. {sdg_hub-0.2.2/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab → sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa}/__init__.py +0 -0
  160. {sdg_hub-0.2.2/tests → sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary}/__init__.py +0 -0
  161. /sdg_hub-0.2.2/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md → /sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  162. /sdg_hub-0.2.2/src/sdg_hub/py.typed → /sdg_hub-0.3.0/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  163. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
  164. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
  165. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
  166. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
  167. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
  168. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
  169. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -0
  170. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
  171. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  172. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/src/sdg_hub.egg-info/top_level.txt +0 -0
  173. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/deprecated/test_llmblock.py +0 -0
  174. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/evaluation/__init__.py +0 -0
  175. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/evaluation/test_evaluate_faithfulness_block.py +0 -0
  176. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/evaluation/test_evaluate_relevancy_block.py +0 -0
  177. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/evaluation/test_verify_question_block.py +0 -0
  178. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
  179. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
  180. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/test_base_block.py +0 -0
  181. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/test_registry.py +0 -0
  182. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/testdata/test_config.yaml +0 -0
  183. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/testdata/test_evaluate_faithfulness.yaml +0 -0
  184. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/testdata/test_evaluate_relevancy.yaml +0 -0
  185. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
  186. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
  187. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
  188. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
  189. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
  190. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/testdata/test_verify_question.yaml +0 -0
  191. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/transform/test_index_based_mapper.py +0 -0
  192. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/transform/test_melt_columns.py +0 -0
  193. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/transform/test_text_concat.py +0 -0
  194. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/transform/test_uniform_col_val_setter.py +0 -0
  195. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/utilblocks/test_combinecolumns.py +0 -0
  196. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -0
  197. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/utilblocks/test_flattenblock.py +0 -0
  198. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/utilblocks/test_renameblock.py +0 -0
  199. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -0
  200. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/utilblocks/test_selectorblock.py +0 -0
  201. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/blocks/utilblocks/test_settomajority.py +0 -0
  202. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/__init__.py +0 -0
  203. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/conftest.py +0 -0
  204. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/test_checkpointer.py +0 -0
  205. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/test_dataset_requirements.py +0 -0
  206. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/test_integration.py +0 -0
  207. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/test_metadata.py +0 -0
  208. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/test_migration.py +0 -0
  209. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/test_registry.py +0 -0
  210. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/flow/test_validation.py +0 -0
  211. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/utils/test_error_handling.py +0 -0
  212. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tests/utils/test_path_resolution.py +0 -0
  213. {sdg_hub-0.2.2 → sdg_hub-0.3.0}/tox.ini +0 -0
@@ -78,7 +78,7 @@ jobs:
78
78
  path: dist
79
79
 
80
80
  - name: "Upload to Test PyPI"
81
- uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
81
+ uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
82
82
  with:
83
83
  repository-url: https://test.pypi.org/legacy/
84
84
 
@@ -130,4 +130,4 @@ jobs:
130
130
  rm ./dist/*.sigstore.json
131
131
 
132
132
  - name: "Upload to PyPI"
133
- uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
133
+ uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.2.2
3
+ Version: 0.3.0
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -53,6 +53,7 @@ Requires-Dist: sentence-transformers; extra == "examples"
53
53
  Requires-Dist: instructor; extra == "examples"
54
54
  Requires-Dist: fastapi; extra == "examples"
55
55
  Requires-Dist: nest-asyncio; extra == "examples"
56
+ Requires-Dist: ipykernel; extra == "examples"
56
57
  Provides-Extra: dev
57
58
  Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
58
59
  Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
@@ -63,6 +64,7 @@ Requires-Dist: pytest-cov; extra == "dev"
63
64
  Requires-Dist: pytest-html; extra == "dev"
64
65
  Requires-Dist: tox<5,>=4.4.2; extra == "dev"
65
66
  Requires-Dist: ruff; extra == "dev"
67
+ Requires-Dist: pytest-env; extra == "dev"
66
68
  Dynamic: license-file
67
69
 
68
70
  # `sdg_hub`: Synthetic Data Generation Toolkit
@@ -0,0 +1,59 @@
1
+ # SDG Knowledge Tuning Configuration
2
+ # Copy this file to .env and update the values as needed
3
+
4
+ # =============================================================================
5
+ # MODEL CONFIGURATION
6
+ # Choose one of the following model providers: hosted_vllm, openai, ollama
7
+ # =============================================================================
8
+
9
+ MODEL_PROVIDER=hosted_vllm
10
+ LITELLM_MODE=PRODUCTION
11
+
12
+ # =============================================================================
13
+ # HOSTED VLLM CONFIGURATION
14
+ # =============================================================================
15
+ VLLM_MODEL=hosted_vllm/meta-llama/Llama-3.3-70B-Instruct
16
+ VLLM_API_BASE=http://localhost:8000/v1
17
+ VLLM_API_KEY=EMPTY
18
+ ENABLE_REASONING=false
19
+ # =============================================================================
20
+ # OPENAI CONFIGURATION
21
+ # =============================================================================
22
+ OPENAI_API_KEY=your-openai-api-key-here
23
+ OPENAI_MODEL=openai/gpt-5
24
+
25
+ # =============================================================================
26
+ # OLLAMA CONFIGURATION
27
+ # =============================================================================
28
+ OLLAMA_MODEL=ollama/gemma3
29
+ OLLAMA_API_BASE=http://localhost:11434
30
+
31
+ # =============================================================================
32
+ # MAAS CONFIGURATION (Red Hat AI Services)
33
+ # =============================================================================
34
+ MAAS_MODEL=your-provisioned-model-name
35
+ MAAS_API_BASE=your-provisioned-model-url
36
+ MAAS_API_KEY=your-maas-api-key-here
37
+
38
+ # =============================================================================
39
+ # DATA CONFIGURATION
40
+ # =============================================================================
41
+ SEED_DATA_PATH=seed_data_val.jsonl
42
+ OUTPUT_DATA_FOLDER=output_data
43
+ RUN_ON_VALIDATION_SET=true
44
+ NUMBER_OF_SUMMARIES=50
45
+
46
+ # =============================================================================
47
+ # DATA MIXING CONFIGURATION
48
+ # =============================================================================
49
+ SAVE_GPT_OSS_FORMAT=false
50
+ STUDENT_MODEL=meta-llama/Llama-3.1-8B-Instruct
51
+
52
+ # Cut sizes for data mixing (comma-separated list of integers)
53
+ # Number of summaries to pick per document for each cut
54
+ CUT_SIZES=3,5,7
55
+
56
+ # Number of Q&A pairs per document
57
+ QA_PER_DOC=3
58
+
59
+ HF_TOKEN=your-hf-token
@@ -0,0 +1,76 @@
1
+ # Knowledge Tuning with Enhanced Summaries
2
+
3
+ ## Objective
4
+
5
+ Pre-trained language models typically encounter most facts in their training data only **once or twice**, if at all. As a result, knowledge of specific details—especially **proprietary or domain-specific documents**—is often incomplete or missing.
6
+
7
+ This pipeline is designed to **inject new knowledge** from a given set of documents into an instruction-tuned model. By generating **multiple document augmentations** (summaries, extractive passages, atomic facts) and **synthetic Q\&A pairs**, we repeat and reinforce important information. This repetition helps the model:
8
+
9
+ * **Memorize facts** it has rarely or never seen before.
10
+ * **Generalize across augmentations**, improving reliability when queried.
11
+ * **Adapt to proprietary knowledge sources** that were absent from pre-training.
12
+
13
+ The final product is a **high-quality training dataset** suitable for fine-tuning, enabling models to answer queries more accurately and faithfully based on the injected documents.
14
+
15
+ ---
16
+
17
+ ## 1. Document Summarization
18
+
19
+ To bootstrap the process, we generate **three complementary types of summaries** for each source document. This ensures the model captures content at multiple levels of abstraction:
20
+
21
+ * **Detailed Summaries** – Rich, comprehensive overviews of the document.
22
+ * **Extractive Summaries** – Directly extracted sentences and passages representing the most important parts.
23
+ * **Atomic Facts** – Concise, standalone factual statements distilled from the text.
24
+
25
+ This multi-perspective approach improves the model’s ability to **memorize, generalize, and recall** key knowledge.
26
+
27
+ ---
28
+
29
+ ## 2. Synthetic Q\&A Generation
30
+
31
+ With summaries in place, we scale up training data via **synthetic Q\&A generation**:
32
+
33
+ * Users provide a small set of **seed examples** (initial Q\&A pairs).
34
+ * The pipeline uses these seeds to generate a large set of **contextually grounded Q\&A pairs**, tightly linked to the summarized documents.
35
+ * This expands sparse seed data into a **rich, diverse training dataset** suitable for fine-tuning.
36
+
37
+ ---
38
+
39
+ ## 3. Quality Control
40
+
41
+ High-quality training data is essential. To ensure faithfulness and accuracy, we employ a **teacher-model evaluation loop**:
42
+
43
+ 1. Provide the teacher model with a generated answer and the original document.
44
+ 2. Ask it to extract each factual claim from the answer.
45
+ 3. Verify whether each claim is **explicitly supported** by the document.
46
+
47
+ Only claims passing this check are retained. This process filters out **hallucinations and unsupported statements**, ensuring reliable Q\&A pairs.
48
+
49
+ ---
50
+
51
+ ## Data Generation Statistics
52
+
53
+ ### Summary Augmentation
54
+
55
+ Each “cut” represents the total number of summaries generated per document across all three augmentation types.
56
+
57
+ | Cut (NUMBER\_OF\_SUMMARIES = 3) | Token Count |
58
+ | ------------------------------- | ----------- |
59
+ | 1 | 2,193,502 |
60
+ | 2 | 4,383,655 |
61
+ | 5 | 10,870,396 |
62
+ | 10 | 21,815,170 |
63
+ | 20 | 43,601,976 |
64
+ | 30 | 65,395,710 |
65
+ | 40 | 87,118,308 |
66
+ | 50 | 108,779,213 |
67
+
68
+ ---
69
+
70
+ ### Finance Bench Example
71
+
72
+ For Finance Bench (NUMBER\_OF\_SUMMARIES = 1):
73
+
74
+ | Cut | Token Count |
75
+ | --- | ----------- |
76
+ | 50 | 213,333,192 |