sdg-hub 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255)
  1. {sdg_hub-0.1.2/src/sdg_hub.egg-info → sdg_hub-0.1.4}/PKG-INFO +2 -2
  2. sdg_hub-0.1.4/examples/knowledge_tuning/README.md +115 -0
  3. sdg_hub-0.1.4/examples/knowledge_tuning/instructlab/README.md +110 -0
  4. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +2 -3
  5. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_utils.py +39 -14
  6. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/pyproject.toml +1 -1
  7. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/_version.py +2 -2
  8. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/annotations/simple_annotations.yaml +1 -1
  9. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/evaluate_relevancy.yaml +1 -2
  10. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flow_runner.py +15 -2
  11. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +1 -1
  12. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +1 -1
  13. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +4 -4
  14. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +16 -28
  15. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/prompts.py +11 -5
  16. {sdg_hub-0.1.2 → sdg_hub-0.1.4/src/sdg_hub.egg-info}/PKG-INFO +2 -2
  17. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub.egg-info/SOURCES.txt +2 -0
  18. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub.egg-info/requires.txt +1 -1
  19. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/actionlint.yaml +0 -0
  20. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/actions/free-disk-space/action.yml +0 -0
  21. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/dependabot.yml +0 -0
  22. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/mergify.yml +0 -0
  23. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/workflows/actionlint.dockerfile +0 -0
  24. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/workflows/actionlint.yml +0 -0
  25. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/workflows/docs.yml +0 -0
  26. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/workflows/e2e.yml +0 -0
  27. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/workflows/lint.yml +0 -0
  28. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/workflows/matchers/actionlint.json +0 -0
  29. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/workflows/matchers/pylint.json +0 -0
  30. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/workflows/pypi.yaml +0 -0
  31. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.github/workflows/test.yml +0 -0
  32. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.gitignore +0 -0
  33. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.isort.cfg +0 -0
  34. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.markdownlint-cli2.yaml +0 -0
  35. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.pre-commit-config.yaml +0 -0
  36. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/.pylintrc +0 -0
  37. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/CLAUDE.md +0 -0
  38. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/CONTRIBUTING.md +0 -0
  39. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/LICENSE +0 -0
  40. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/MANIFEST.in +0 -0
  41. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/Makefile +0 -0
  42. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/README.md +0 -0
  43. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/assets/imgs/IL_skills_pipeline.png +0 -0
  44. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/assets/imgs/fig-workflow.png +0 -0
  45. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/assets/imgs/instructlab-banner.png +0 -0
  46. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/assets/imgs/overview.png +0 -0
  47. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/.nojekyll +0 -0
  48. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/README.md +0 -0
  49. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/_coverpage.md +0 -0
  50. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/_navbar.md +0 -0
  51. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/_sidebar.md +0 -0
  52. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/architecture.md +0 -0
  53. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/blocks.md +0 -0
  54. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/changelog.md +0 -0
  55. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/configuration.md +0 -0
  56. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/development.md +0 -0
  57. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/examples.md +0 -0
  58. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/index.html +0 -0
  59. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/installation.md +0 -0
  60. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/prompts.md +0 -0
  61. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/quick-start.md +0 -0
  62. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/docs/web-interface.md +0 -0
  63. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/data-generation-with-llama-70b/data-generation-with-llama-70b.ipynb +0 -0
  64. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/data-generation-with-llama-70b/synth_knowledge1.5_llama3.3.yaml +0 -0
  65. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  66. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
  67. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  68. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  69. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  70. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  71. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
  72. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/README.md +0 -0
  73. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/assets/customized_nano_quality_results.png +0 -0
  74. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/blocks/blocks.py +0 -0
  75. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge1.5_nemotron_super_49b.yaml +0 -0
  76. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b.yaml +0 -0
  77. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_rewrite_with_diversity.yaml +0 -0
  78. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_summary_diversity.yaml +0 -0
  79. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_summary_diversity_cot.yaml +0 -0
  80. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/generate.py +0 -0
  81. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_answers.yaml +0 -0
  82. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_answers_cot.yaml +0 -0
  83. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_doc_rewrite_inst.yaml +0 -0
  84. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_document_rewrite.yaml +0 -0
  85. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_questions.yaml +0 -0
  86. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_questions_responses.yaml +0 -0
  87. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_summary.yaml +0 -0
  88. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_summary_inst.yaml +0 -0
  89. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg.ipynb +0 -0
  90. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg_data_mixing.ipynb +0 -0
  91. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg_financebench.ipynb +0 -0
  92. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/utils.py +0 -0
  93. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/README.md +0 -0
  94. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/annotation_classification.ipynb +0 -0
  95. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/blocks/__init__.py +0 -0
  96. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/blocks/add_question.py +0 -0
  97. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/blocks/docling_parse_pdf.py +0 -0
  98. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/blocks/json_format.py +0 -0
  99. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/flows/detailed_annotation.yaml +0 -0
  100. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/flows/grounded_summary_extraction.yaml +0 -0
  101. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/flows/simple_annotation.yaml +0 -0
  102. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/flows/unstructured_to_structured.yaml +0 -0
  103. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/prompts/keywords.yaml +0 -0
  104. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/prompts/named_entities.yaml +0 -0
  105. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/prompts/sentiment.yaml +0 -0
  106. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/prompts/summary.yaml +0 -0
  107. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/09b5b62d328d3d0719b6825357fdfb48.pdf +0 -0
  108. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/0d631e444d1c22f0be99a69f5deaff94.pdf +0 -0
  109. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1270f7f67f406b52a2ee86584b452bff.pdf +0 -0
  110. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/14f3d2486b21e639a953afb7ad03d90c.pdf +0 -0
  111. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1689b94530eca82b7758c86b4cf3125f.pdf +0 -0
  112. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/171fd9df333ddd814c764843ed624121.pdf +0 -0
  113. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1949bd0c9c4c23d495d880c4c552bfe1.pdf +0 -0
  114. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/2b626b620ef42f716c6028c74ee4187b.pdf +0 -0
  115. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3877b1983229ec488c6349a188bccf92.pdf +0 -0
  116. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3bc6d3e1c0a117340d288c289bf7f679.pdf +0 -0
  117. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3e714a49937be1672aa48244ba7254ce.pdf +0 -0
  118. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/6064088db0200b32f3f3e848047c5ab6.pdf +0 -0
  119. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/73c60e60043b8775dac929320839a8c6.pdf +0 -0
  120. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/77423f08f0208d476dea73c639f6293a.pdf +0 -0
  121. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/78cf0d3e40caba622d8914916f0f9146.pdf +0 -0
  122. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/7a29e2dcd505f944b16d1e3173cb1c01.pdf +0 -0
  123. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/8c1b4f4af2af2847a240041390e31399.pdf +0 -0
  124. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/8cd753ed00aeee0ed32d03823eef3f7e.pdf +0 -0
  125. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/a24a661c2eb55542903c72391ec09f9b.pdf +0 -0
  126. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b3d7bc295d09d9927e465213612c0192.pdf +0 -0
  127. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b7050f62f52a3d2803beea21404f7af6.pdf +0 -0
  128. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b9b40b0c1e92fb226067bdceacbdab5c.pdf +0 -0
  129. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c20824ea6f927fe380f48a904cf4821b.pdf +0 -0
  130. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c2bad61ce58687fad602549f6048004b.pdf +0 -0
  131. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c47a92e006b54d014a79b447528c55a7.pdf +0 -0
  132. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/da879f8ea1c23aa6565cccaacac271fc.pdf +0 -0
  133. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/e52e6870e8a04339ef969543fc0f0329.pdf +0 -0
  134. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/ecd8e1f1c0fa27dfdd24b358cb65012f.pdf +0 -0
  135. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/f28832481653818f8062a497655fb09e.pdf +0 -0
  136. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/ff898f396d49760343d08575ea773b54.pdf +0 -0
  137. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts.jsonl +0 -0
  138. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/table_manipulation_qna.yaml +0 -0
  139. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/seed_data/unstructured_to_structured_qna.yaml +0 -0
  140. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/structured_summary.ipynb +0 -0
  141. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/table_manipulation.ipynb +0 -0
  142. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/examples/skills_tuning/instructlab/unstructured_to_structured.ipynb +0 -0
  143. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/scripts/__init__.py +0 -0
  144. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/scripts/ruff.sh +0 -0
  145. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/setup.cfg +0 -0
  146. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/__init__.py +0 -0
  147. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/blocks/__init__.py +0 -0
  148. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/blocks/block.py +0 -0
  149. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/blocks/llmblock.py +0 -0
  150. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/blocks/openaichatblock.py +0 -0
  151. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/blocks/utilblocks.py +0 -0
  152. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/checkpointer.py +0 -0
  153. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/__init__.py +0 -0
  154. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/annotations/__init__.py +0 -0
  155. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/annotations/cot_reflection.yaml +0 -0
  156. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/annotations/detailed_annotations.yaml +0 -0
  157. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/annotations/detailed_description.yaml +0 -0
  158. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -0
  159. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/__init__.py +0 -0
  160. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/atomic_facts.yaml +0 -0
  161. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -0
  162. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/detailed_summary.yaml +0 -0
  163. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -0
  164. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/evaluate_question.yaml +0 -0
  165. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/extractive_summary.yaml +0 -0
  166. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -0
  167. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/generate_questions.yaml +0 -0
  168. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -0
  169. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/generate_responses.yaml +0 -0
  170. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/mcq_generation.yaml +0 -0
  171. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/router.yaml +0 -0
  172. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -0
  173. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/reasoning/__init__.py +0 -0
  174. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -0
  175. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/__init__.py +0 -0
  176. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/analyzer.yaml +0 -0
  177. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/annotation.yaml +0 -0
  178. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/contexts.yaml +0 -0
  179. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/critic.yaml +0 -0
  180. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -0
  181. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -0
  182. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -0
  183. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -0
  184. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/freeform_questions.yaml +0 -0
  185. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/freeform_responses.yaml +0 -0
  186. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/grounded_questions.yaml +0 -0
  187. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/grounded_responses.yaml +0 -0
  188. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -0
  189. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  190. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/icl_examples/coding.yaml +0 -0
  191. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -0
  192. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -0
  193. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/icl_examples/math.yaml +0 -0
  194. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -0
  195. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -0
  196. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/icl_examples/writing.yaml +0 -0
  197. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/judge.yaml +0 -0
  198. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/planner.yaml +0 -0
  199. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/respond.yaml +0 -0
  200. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/revised_responder.yaml +0 -0
  201. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/router.yaml +0 -0
  202. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -0
  203. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -0
  204. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flow.py +0 -0
  205. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flows/generation/skills/improve_responses.yaml +0 -0
  206. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -0
  207. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -0
  208. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -0
  209. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/flows/generation/skills/synth_skills.yaml +0 -0
  210. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/logger_config.py +0 -0
  211. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/pipeline.py +0 -0
  212. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/py.typed +0 -0
  213. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/registry.py +0 -0
  214. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/sdg.py +0 -0
  215. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/utils/__init__.py +0 -0
  216. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/utils/config_validation.py +0 -0
  217. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/utils/datautils.py +0 -0
  218. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/utils/error_handling.py +0 -0
  219. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/utils/path_resolution.py +0 -0
  220. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub/utils/validation_result.py +0 -0
  221. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  222. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/src/sdg_hub.egg-info/top_level.txt +0 -0
  223. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/__init__.py +0 -0
  224. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/test_llmblock.py +0 -0
  225. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/test_openaichatblock.py +0 -0
  226. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/testdata/test_config.yaml +0 -0
  227. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/utilblocks/test_combinecolumns.py +0 -0
  228. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -0
  229. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/utilblocks/test_filterblock.py +0 -0
  230. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/utilblocks/test_flattenblock.py +0 -0
  231. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/utilblocks/test_renameblock.py +0 -0
  232. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -0
  233. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/utilblocks/test_selectorblock.py +0 -0
  234. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/blocks/utilblocks/test_settomajority.py +0 -0
  235. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/flows/test_flow.py +0 -0
  236. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/flows/test_flow_column_validation.py +0 -0
  237. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/flows/test_flow_path.py +0 -0
  238. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/flows/test_flow_validation.py +0 -0
  239. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/flows/testdata/test_config_1.yaml +0 -0
  240. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/flows/testdata/test_flow_1.yaml +0 -0
  241. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/flows/testdata/test_flow_2.yaml +0 -0
  242. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/test_checkpointer.py +0 -0
  243. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/test_flowrunner.py +0 -0
  244. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/test_pipeline.py +0 -0
  245. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/test_sdg.py +0 -0
  246. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/utils/test_config_validation.py +0 -0
  247. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/utils/test_error_handling.py +0 -0
  248. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tests/utils/test_path_resolution.py +0 -0
  249. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/tox.ini +0 -0
  250. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/web_interface/README.md +0 -0
  251. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/web_interface/app.py +0 -0
  252. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/web_interface/static/css/style.css +0 -0
  253. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/web_interface/static/js/app.js +0 -0
  254. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/web_interface/templates/index.html +0 -0
  255. {sdg_hub-0.1.2 → sdg_hub-0.1.4}/web_interface/test_block_types.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sdg_hub
- Version: 0.1.2
+ Version: 0.1.4
  Summary: Synthetic Data Generation
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
  License: Apache-2.0
@@ -36,7 +36,7 @@ Requires-Dist: flask>=3.0.2; extra == "web-interface"
  Requires-Dist: pyyaml>=6.0.1; extra == "web-interface"
  Requires-Dist: flask-wtf>=1.2.2; extra == "web-interface"
  Provides-Extra: vllm
- Requires-Dist: vllm<0.8.4,>=0.8.0; extra == "vllm"
+ Requires-Dist: vllm>=0.9.1; extra == "vllm"
  Requires-Dist: torch>=2.0.0; extra == "vllm"
  Requires-Dist: transformers>=4.37.0; extra == "vllm"
  Requires-Dist: accelerate>=0.21.0; extra == "vllm"
@@ -0,0 +1,115 @@
+ # Synthetic Data Generation for Knowledge Tuning
+
+ ## What is Knowledge Tuning?
+
+ **Knowledge tuning** is the process of adapting a large language model (LLM) to new factual content by training it on specific documents. The goal is to enable the model to **recall and reason over document-grounded information** when performing downstream tasks such as:
+
+ * Question Answering
+ * Summarization
+ * Entity Extraction
+ * Other document-specific reasoning tasks
+
+ This adaptation can be used:
+
+ * As a **standalone fine-tuned model**, or
+ * As part of a **Retrieval-Augmented Generation (RAG)** pipeline to enhance factual accuracy and contextuality.
+
+ ---
+
+ ### Setup Instructions
+
+ #### Install sdg-hub
+
+ ```bash
+ pip install sdg-hub==0.1.0a4
+ ```
+
+ #### Install with optional dependencies
+
+ If you want to use the vLLM server, you can install it with the following command:
+
+ ```bash
+ pip install sdg-hub[vllm]
+ ```
+
+ To use docling, install it with the following command:
+
+ ```bash
+ pip install sdg-hub[examples]
+ ```
+
+ ### Serving the Teacher Model
+
+ #### vLLM Server
+
+ Launch the vLLM server with the following command:
+
+ ```bash
+ vllm serve meta-llama/Llama-3.3-70B-Instruct --tensor-parallel-size 4
+ ```
+
+ ## Repository Structure
+
+ This repository demonstrates how to generate synthetic data for knowledge tuning using different approaches:
+
+ ### Examples
+
+ 1. [`instructlab/`](instructlab/):
+    Implements knowledge tuning using the **InstructLab** pipeline, which supports a two-phase approach:
+
+    * Phase 1: Knowledge tuning via synthetic QAs
+    * Phase 2: Instruction tuning to generalize reasoning skills
+
+ 2. [`knowledge_tuning_with_reasoning_model/`](knowledge_tuning_with_reasoning_model/):
+    Uses **Nemotron Super** as the teacher model to generate reasoning-focused synthetic data grounded in document content. We also show how to edit the knowledge pipeline to introduce new types of summaries.
+
+ Each example includes:
+
+ * Source document processing
+ * QA generation with a teacher model
+ * Filtering and validation logic
+ * Dataset formatting for fine-tuning
+
+ 3. [`translation_example`](translation_example/):
+    Implements a translation block to translate an article into a target language for generating knowledge QA. The example scripts show how to translate a Kannada Wikipedia article into English and generate synthetic QA to train a model.
+ ---
+
+ ## Data Post-Processing
+
+ Once synthetic QA data is generated, you’ll need to prepare it for training:
+
+ ### Key Practices
+
+ * Append source document content to the generated QA to improve memorization and coverage.
+ * During training, backpropagate on both the **prompt** (document + question) and the **response** (answer).
+ * For `instructlab.training`, you can use the `unmask` field to enable pretraining-style loss computation over the full prompt-response.
+
+ ### Creating QA dataset
+
+ * You can use the function below to transform the generated dataset into prompt + response pairs in messages format for training.
+ * You can control various options, such as appending the document to the question or adding the document outline to the document.
+ ```python
+ from knowledge_utils import generate_knowledge_qa_dataset
+
+ knowl_train = generate_knowledge_qa_dataset(
+     generated_dataset=generated_data,
+     keep_context_separate=False,
+     keep_document_outline=True,
+     keep_columns=['document', 'document_outline', 'raw_document']
+ )
+ ```
+ * `keep_context_separate=False`: Includes the document in the prompt
+ * `keep_document_outline=True`: Adds structure to the prompt using the outline
+ * `keep_columns`: Retains metadata for record-keeping (not used in training)
+
+
+ ### Workflow: InstructLab (Knowledge + RAFT)
+ You can find the steps for data post-processing [here](instructlab/README.md#data-post-processing).
+
+ ### Workflow: Fine-tuning Instruct Model
+
+ * You can simply take the generated data and continue instruction tuning an existing instruct model (e.g. Qwen 2.5 7B Instruct, Llama 3.3 70B, etc.)
+ * Simply follow the [Creating QA dataset](#creating-qa-dataset) section for creating the training data.
+ * Note: The model might suffer catastrophic forgetting and might need a replay buffer of instruction data, or you might need to explore alternatives such as parameter-efficient fine-tuning.
+
+
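The `unmask` field mentioned in the Key Practices above can be attached as an ordinary column on the transformed dataset. The sketch below is illustrative only and not part of the package: it assumes `knowl_train` is the Hugging Face `datasets.Dataset` returned by `generate_knowledge_qa_dataset`, and the helper name `add_unmask_flag` is made up for this example.

```python
from datasets import Dataset

def add_unmask_flag(ds: Dataset) -> Dataset:
    """Tag every record so the trainer computes loss over prompt + response."""
    # Adds a constant boolean column; assumes this is how the unmask flag is consumed.
    return ds.map(lambda _: {"unmask": True})

# `knowl_train` comes from the generate_knowledge_qa_dataset call shown above.
knowl_train = add_unmask_flag(knowl_train)
```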
@@ -0,0 +1,110 @@
+ # InstructLab: Synthetic Data Generation for Knowledge Tuning
+ ![InstructLab Banner](../../../assets/imgs/instructlab-banner.png)
+
+ The provided notebooks show an sdg_hub pipeline for generating high-quality synthetic data from your documents. By following the methodology of the **LAB (Large-scale Alignment for Chatbots)** framework, as detailed in our [research paper](https://arxiv.org/pdf/2403.01081), you can effectively tune language models to master the knowledge contained within your domain-specific documentation.
+
+ ## How It Works: A Three-Phase Pipeline
+
+ Our data generation process is designed to operate in three distinct phases, ensuring the creation of a robust and reliable dataset for model training.
+
+ ### 1\. Document Summarization
+
+ To kickstart the process, we generate three unique summaries of your source documents. This multi-faceted approach helps the model to thoroughly memorize and recall the key information. The summaries include:
+
+ * **Detailed Summaries:** Comprehensive overviews of the content.
+ * **Extractive Summaries:** Key sentences and passages pulled directly from the text.
+ * **Atomic Facts:** A list of the most critical, standalone pieces of information.
+
+ ### 2\. Synthetic Q\&A Generation
+
+ Next, our pipeline leverages user-provided "seed examples"—sample questions and answers—to generate a wealth of synthetic Q\&A pairs. These new pairs are contextually grounded in the summarized documents, effectively scaling up your initial examples into a diverse training dataset.
+
+ ### 3\. Quality Control
+
+ To ensure the integrity of our generated data, we employ a quality-checking phase. Using a "teacher" model, we perform a faithfulness evaluation by:
+
+ 1. Providing the model with a generated answer and the original source document.
+ 2. Tasking the model to extract every claim made in the answer.
+ 3. Verifying that each claim is factually supported by the provided document.
+
+ This process filters out inaccuracies and ensures that only high-quality, faithful Q\&A pairs make it into the final dataset.
+
+ ## Getting Started
+
+ To begin using the pipeline, simply install the `sdg_hub` library. From there, you can instantiate and run the synthetic data generation process with the following code:
+
+ ```python
+ from sdg_hub.flow import Flow
+ from sdg_hub.sdg import SDG
+
+ # Load the knowledge generation pipeline from the YAML file
+ knowledge_agentic_pipeline = "../../../src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml"
+ flow = Flow(openai_client).get_flow_from_file(knowledge_agentic_pipeline)
+
+ # Initialize the Synthetic Data Generator
+ generator = SDG(
+     flows=[flow],
+     num_workers=1,
+     batch_size=1,
+     save_freq=1000,
+ )
+ ```
+
+ ## InstructLab Training Methodology
+
+ Our training process is structured to build upon a pre-trained model, systematically enhancing its skills and knowledge.
+
+ 1. **Foundation Training:** We begin by training a pre-trained model on foundational skills such as logic, coding, and math. The instruction data in this phase features short, direct responses.
+ 2. **Foundation Knowledge:** Next, we expand the model's general knowledge base by training it on general textbooks and benchmarking it against MMLU. The result of these first two stages is what we term the **starter model**.
+
+ This **starter model** then serves as the base for our specialized, two-phase knowledge tuning:
+
+ * **Phase 1: Knowledge Tuning:** We perform pretraining-style training on the document data generated by our pipeline. This allows the model to internalize the new knowledge and be able to recall and answer questions based on it.
+ * **Phase 2: Skills Tuning:** Building on the previous phase, we do instruction tuning on general skills (a combination of instruction-tuning data and skills generated with sdg_hub). To prevent the model from forgetting the newly acquired knowledge, we mix in data from the previous stage. We also incorporate [RAFT-style](https://openreview.net/forum?id=rzQGHXNReU) data to enhance the model's robustness for RAG on the target documents.
+
+ ## Data Post-Processing
+
+ After generating your data, use the provided utility functions to prepare it for the two-phase training process. All helper functions are located in `examples/knowledge_tuning/knowledge_utils.py`.
+
+ ### 1\. Knowledge Dataset (for Phase 1)
+
+ This dataset is used for the initial knowledge-tuning phase. You can also merge datasets from multiple documents to train on a set of documents simultaneously.
+
+ This function also creates a summarization dataset that formats the generated summaries as a task: document + instruction -> document summary.
+
+ ```python
+ from knowledge_utils import create_knowledge_pretraining_ds
+
+ # Create the dataset for knowledge pre-training
+ knowledge_data = create_knowledge_pretraining_ds(generated_dataset=generated_data)
+ ```
+
+ ### 2\. Skills Dataset (for Phase 2)
+
+ This dataset combines the knowledge-specific data with RAFT-style examples for the second phase of tuning. It can also be mixed with general instruction-tuning data to grant the model broad instruction-following abilities while retaining the specialized knowledge.
+
+ ```python
+ from knowledge_utils import create_knowledge_regular_ds
+ from datasets import concatenate_datasets
+
+ # Create the dataset with RAFT and summary data
+ raft_and_summary_data = create_knowledge_regular_ds(generated_dataset=generated_data)
+
+ # Create the core knowledge dataset.
+ # The add_auxiliary_dataset parameter controls whether the summarization dataset mentioned above is added to the returned dataset.
+ knowledge_data = create_knowledge_pretraining_ds(generated_dataset=generated_data, add_auxiliary_dataset=False)
+
+ # Combine the datasets for the skills tuning phase
+ knowledge_skills_data = concatenate_datasets([raft_and_summary_data, knowledge_data])
+ ```
+
+ ---
+
+ ## Generation Statistics
+
+ Default generation parameters (based on `llama-3.3-70B`) are defined in:
+ [`synth_knowledge1.5.yaml`](../../src/sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml)
+
+ * The pipeline converts each input document into **3 summaries**.
+ * Outputs vary based on the teacher model and generation parameters (e.g. `temperature`, `top_p`, `top_k`), which can be set in the `gen_kwargs` section of the flow.
+ * Generation currently uses temperature=0.0 and is deterministic.
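As noted above, Phase 1 data from several documents can be merged and trained on together. A minimal sketch, assuming `generated_data_per_doc` is a placeholder list holding the generation output for each document; the helper and `concatenate_datasets` are the same ones used in the snippets above:

```python
from datasets import concatenate_datasets
from knowledge_utils import create_knowledge_pretraining_ds

# Build one Phase 1 dataset per document, then merge them for joint training.
phase1_data = concatenate_datasets(
    [create_knowledge_pretraining_ds(generated_dataset=ds) for ds in generated_data_per_doc]
)
```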
@@ -34,7 +34,6 @@
  "\n",
  "# First Party\n",
  "from sdg_hub.flow import Flow\n",
- "from sdg_hub.pipeline import Pipeline\n",
  "from sdg_hub.sdg import SDG\n",
  "import sys\n",
  "import os\n",
@@ -86,9 +85,9 @@
  "outputs": [],
  "source": [
  "knowledge_agentic_pipeline = \"../../../src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml\"\n",
- "flow_cfg = Flow(client).get_flow_from_file(knowledge_agentic_pipeline)\n",
+ "flow = Flow(client).get_flow_from_file(knowledge_agentic_pipeline)\n",
  "sdg = SDG(\n",
- " [Pipeline(flow_cfg)],\n",
+ " flows=[flow],\n",
  " num_workers=1,\n",
  " batch_size=1,\n",
  " save_freq=1000,\n",
@@ -268,7 +268,22 @@ def build_raft_dataset(ds: Dataset, p, num_doc_in_context=4):
 
 
  def create_knowledge_regular_ds(generated_dataset: Dataset):
- # Phase 1.0
+ """
+ Create a knowledge dataset for the Skills Phase of knowledge tuning.
+
+ This function generates QA datasets with RAFT-style context separation
+ and optionally includes auxiliary datasets for enhanced training.
+
+ Parameters
+ ----------
+ generated_dataset : Dataset
+ The input dataset containing generated knowledge content
+
+ Returns
+ -------
+ Dataset
+ Processed dataset ready for skills phase training
+ """
  knowledge_ds = generate_knowledge_qa_dataset(
  generated_dataset, keep_context_separate=True
  )
@@ -276,26 +291,36 @@ def create_knowledge_regular_ds(generated_dataset: Dataset):
 
  auxiliary_dataset = create_auxiliary_dataset(generated_dataset)
  if auxiliary_dataset is not None:
- transformed_data = safe_concatenate_datasets([knowledge_ds, auxiliary_dataset])
- else:
- transformed_data = knowledge_ds
- return transformed_data
+ knowledge_ds = safe_concatenate_datasets([knowledge_ds, auxiliary_dataset])
+ return knowledge_ds
 
 
- def create_knowledge_pretraining_ds(generated_dataset: Dataset):
- # Phase 0.7
+ def create_knowledge_pretraining_ds(generated_dataset: Dataset, add_auxiliary_dataset: bool = True):
+ # Phase 0.7 (Knowledge Phase)
+ """
+ Create a knowledge dataset for the Knowledge Phase of knowledge tuning.
+
+ This function generates QA datasets for pretraining-style knowledge tuning
+ with optional auxiliary dataset inclusion.
+
+ Parameters
+ ----------
+ generated_dataset (Dataset): The dataset containing generated knowledge data.
+ add_auxiliary_dataset (bool): Whether to include an auxiliary dataset.
+
+ Returns
+ -------
+ Dataset: The generated knowledge dataset.
+ """
  knowledge_ds = generate_knowledge_qa_dataset(
- generated_dataset, keep_context_separate=False
- )
+ generated_dataset, keep_context_separate=False)
  knowledge_ds = knowledge_ds.map(_conv_pretrain)
 
  auxiliary_dataset = create_auxiliary_dataset(generated_dataset)
- if auxiliary_dataset is not None:
+ if auxiliary_dataset is not None and add_auxiliary_dataset:
  auxiliary_dataset = auxiliary_dataset.map(_conv_pretrain)
- transformed_data = safe_concatenate_datasets([knowledge_ds, auxiliary_dataset])
- else:
- transformed_data = knowledge_ds
- return transformed_data
+ knowledge_ds = safe_concatenate_datasets([knowledge_ds, auxiliary_dataset])
+ return knowledge_ds
 
 
  def fuse_texts(text_list, short_length_threshold=100):
@@ -55,7 +55,7 @@ web_interface = [
  "flask-wtf>=1.2.2",
  ]
  vllm = [
- "vllm>=0.8.0,<0.8.4",
+ "vllm>=0.9.1",
  "torch>=2.0.0",
  "transformers>=4.37.0",
  "accelerate>=0.21.0",
@@ -17,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.1.2'
- __version_tuple__ = version_tuple = (0, 1, 2)
+ __version__ = version = '0.1.4'
+ __version_tuple__ = version_tuple = (0, 1, 4)
@@ -1,4 +1,4 @@
- system: null
+ system: "You are a helpful assistant that annotates text."
  introduction: "Task Description: Data Annotation"
  principles: null
  examples: null
@@ -9,7 +9,7 @@ principles: |
 
  For each question, assign a score of 1 point if the response meets the criteria, and 0 points if it does not. After evaluating each question, provide detailed feedback explaining your reasoning behind the scores awarded.
 
- Conclude your evaluation with a final result, strictly using the following format: 'Total Score: X'. The total score should represent the sum of points assigned for each question, with a maximum possible score of 2 points.
+ Conclude your evaluation with a total score as a final result. The total score should represent the sum of points assigned for each question, with a maximum possible score of 2 points.
  Only evaluate the response based on the above criteria, do not create new questions.
 
  examples: |
@@ -49,7 +49,6 @@ examples: |
  0
  [End of Score]
 
-
  Example 3:
  [Start of Question]
  What are the benefits of electric vehicles?
@@ -42,6 +42,7 @@ def run_flow(
  debug: bool = False,
  dataset_start_index: int = 0,
  dataset_end_index: Optional[int] = None,
+ api_key: Optional[str] = None,
  ) -> None:
  """Process the dataset using the specified configuration.
 
@@ -69,6 +70,8 @@
  Start index for dataset slicing, by default 0.
  dataset_end_index : Optional[int], optional
  End index for dataset slicing, by default None.
+ api_key : Optional[str], optional
+ API key for the remote endpoint. If not provided, will use OPENAI_API_KEY environment variable, by default None.
 
  Returns
  -------
@@ -137,9 +140,9 @@
  ) from e
 
  # Validate API configuration
- openai_api_key = os.environ.get("OPENAI_API_KEY")
+ openai_api_key = api_key or os.environ.get("OPENAI_API_KEY")
  if not openai_api_key or openai_api_key == "EMPTY":
- logger.warning("OPENAI_API_KEY not set or is 'EMPTY'. API calls may fail.")
+ logger.warning("API key not provided and OPENAI_API_KEY not set or is 'EMPTY'. API calls may fail.")
 
  openai_api_base = endpoint
  if not openai_api_base:
@@ -349,6 +352,12 @@
  @click.option(
  "--dataset_end_index", type=int, default=None, help="End index of the dataset."
  )
+ @click.option(
+ "--api_key",
+ type=str,
+ default=None,
+ help="API key for the remote endpoint. If not provided, will use OPENAI_API_KEY environment variable.",
+ )
  def main(
  ds_path: str,
  bs: int,
@@ -361,6 +370,7 @@
  debug: bool,
  dataset_start_index: int,
  dataset_end_index: Optional[int],
+ api_key: Optional[str],
  ) -> None:
  """CLI entry point for running data generation flows.
 
@@ -388,6 +398,8 @@
  Start index for dataset slicing.
  dataset_end_index : Optional[int]
  End index for dataset slicing.
+ api_key : Optional[str]
+ API key for the remote endpoint. If not provided, will use OPENAI_API_KEY environment variable.
 
  Returns
  -------
@@ -406,6 +418,7 @@
  debug=debug,
  dataset_start_index=dataset_start_index,
  dataset_end_index=dataset_end_index,
+ api_key=api_key,
  )
  except (
  DatasetLoadError,
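The precedence introduced by this change: an explicitly supplied `api_key` (or `--api_key` on the CLI) wins, and the `OPENAI_API_KEY` environment variable remains the fallback. A minimal sketch of that resolution logic; `resolve_api_key` is an illustrative helper, not a function in `flow_runner.py`:

```python
import os
from typing import Optional

def resolve_api_key(api_key: Optional[str] = None) -> Optional[str]:
    # Explicit argument first, then the OPENAI_API_KEY environment variable.
    return api_key or os.environ.get("OPENAI_API_KEY")
```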
@@ -2,7 +2,7 @@
  block_config:
  block_name: gen_mmlu_knowledge
  config_path: configs/knowledge/mcq_generation.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - mmlubench_question
  - mmlubench_answer
@@ -2,7 +2,7 @@
  block_config:
  block_name: gen_knowledge
  config_path: configs/knowledge/simple_generate_qa.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - output
  gen_kwargs:
@@ -2,7 +2,7 @@
  block_config:
  block_name: gen_knowledge
  config_path: configs/knowledge/generate_questions_responses.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - question
  - response
@@ -20,7 +20,7 @@
  block_config:
  block_name: eval_faithfulness_qa_pair
  config_path: configs/knowledge/evaluate_faithfulness.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - explanation
  - judgment
@@ -43,7 +43,7 @@
  block_config:
  block_name: eval_relevancy_qa_pair
  config_path: configs/knowledge/evaluate_relevancy.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - feedback
  - score
@@ -67,7 +67,7 @@
  block_config:
  block_name: eval_verify_question
  config_path: configs/knowledge/evaluate_question.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - explanation
  - rating
@@ -8,35 +8,31 @@
  block_config:
  block_name: gen_detailed_summary
  config_path: configs/knowledge/detailed_summary.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - summary_detailed
  gen_kwargs:
- max_tokens: 4096
- temperature: 0.7
- n: 50
+ max_tokens: 2048
 
  - block_type: LLMBlock
  block_config:
  block_name: gen_atomic_facts
  config_path: configs/knowledge/atomic_facts.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - summary_atomic_facts
  gen_kwargs:
- max_tokens: 4096
- temperature: 0.7
+ max_tokens: 2048
 
  - block_type: LLMBlock
  block_config:
  block_name: gen_extractive_summary
  config_path: configs/knowledge/extractive_summary.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - summary_extractive
  gen_kwargs:
- max_tokens: 4096
- temperature: 0.7
+ max_tokens: 2048
 
  - block_type: FlattenColumnsBlock
  block_config:
@@ -59,33 +55,25 @@
  - block_type: LLMBlock
  block_config:
  block_name: knowledge generation
- config_path: configs/knowledge/generate_questions.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ config_path: configs/knowledge/generate_questions_responses.yaml
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - question
+ - response
  parser_kwargs:
  parser_name: custom
- parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
- gen_kwargs:
- temperature: 0.7
- max_tokens: 100
-
- - block_type: LLMBlock
- block_config:
- block_name: knowledge generation
- config_path: configs/knowledge/generate_responses.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
- output_cols:
- - response
+ parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
+ parser_cleanup_tags:
+ - "[END]"
  gen_kwargs:
- temperature: 0.7
+ temperature: 0.0
  max_tokens: 2048
 
  - block_type: LLMBlock
  block_config:
  block_name: eval_faithfulness_qa_pair
  config_path: configs/knowledge/evaluate_faithfulness.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - explanation
  - judgment
@@ -106,7 +94,7 @@
  block_config:
  block_name: eval_relevancy_qa_pair
  config_path: configs/knowledge/evaluate_relevancy.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - feedback
  - score
@@ -128,7 +116,7 @@
  block_config:
  block_name: eval_verify_question
  config_path: configs/knowledge/evaluate_question.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+ model_id: meta-llama/Llama-3.3-70B-Instruct
  output_cols:
  - explanation
  - rating
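The rewritten generation block above now parses the question and answer out of a single completion using the combined `parsing_pattern` and strips the `[END]` marker via `parser_cleanup_tags`. A quick way to sanity-check that regex outside the flow; the sample text below is made up, while the pattern is the same string shown in the hunk, unescaped for Python:

```python
import re

pattern = (
    r"\[(?:Question|QUESTION)\]\s*(.*?)\s*"
    r"\[(?:Answer|ANSWER)\]\s*(.*?)\s*"
    r"(?=\[(?:Question|QUESTION)\]|$)"
)
sample = (
    "[QUESTION] What does the report cover? "
    "[ANSWER] FY2024 results. [END] "
    "[QUESTION] Who published it? "
    "[ANSWER] IBM. [END]"
)
for question, answer in re.findall(pattern, sample, flags=re.DOTALL):
    # parser_cleanup_tags removes the trailing [END] marker in the real block
    print(question, "->", answer.replace("[END]", "").strip())
```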