sdg-hub 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256)
  1. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/workflows/pypi.yaml +1 -1
  2. {sdg_hub-0.1.1/src/sdg_hub.egg-info → sdg_hub-0.1.3}/PKG-INFO +2 -2
  3. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/blocks.md +178 -0
  4. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_utils.py +77 -11
  5. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/pyproject.toml +1 -1
  6. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/_version.py +2 -2
  7. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/blocks/__init__.py +6 -0
  8. sdg_hub-0.1.3/src/sdg_hub/blocks/openaichatblock.py +556 -0
  9. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/annotations/simple_annotations.yaml +1 -1
  10. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/evaluate_relevancy.yaml +1 -2
  11. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flow.py +21 -18
  12. sdg_hub-0.1.3/src/sdg_hub/flow_runner.py +450 -0
  13. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +1 -1
  14. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +1 -1
  15. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +4 -4
  16. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +8 -13
  17. sdg_hub-0.1.3/src/sdg_hub/prompts.py +74 -0
  18. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/utils/__init__.py +5 -0
  19. sdg_hub-0.1.3/src/sdg_hub/utils/error_handling.py +94 -0
  20. sdg_hub-0.1.3/src/sdg_hub/utils/path_resolution.py +62 -0
  21. {sdg_hub-0.1.1 → sdg_hub-0.1.3/src/sdg_hub.egg-info}/PKG-INFO +2 -2
  22. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub.egg-info/SOURCES.txt +6 -0
  23. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub.egg-info/requires.txt +1 -1
  24. sdg_hub-0.1.3/tests/blocks/test_openaichatblock.py +647 -0
  25. sdg_hub-0.1.3/tests/test_flowrunner.py +899 -0
  26. sdg_hub-0.1.3/tests/utils/test_error_handling.py +242 -0
  27. sdg_hub-0.1.3/tests/utils/test_path_resolution.py +223 -0
  28. sdg_hub-0.1.1/src/sdg_hub/flow_runner.py +0 -216
  29. sdg_hub-0.1.1/src/sdg_hub/prompts.py +0 -43
  30. sdg_hub-0.1.1/tests/test_flowrunner.py +0 -455
  31. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/actionlint.yaml +0 -0
  32. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/actions/free-disk-space/action.yml +0 -0
  33. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/dependabot.yml +0 -0
  34. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/mergify.yml +0 -0
  35. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/workflows/actionlint.dockerfile +0 -0
  36. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/workflows/actionlint.yml +0 -0
  37. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/workflows/docs.yml +0 -0
  38. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/workflows/e2e.yml +0 -0
  39. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/workflows/lint.yml +0 -0
  40. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/workflows/matchers/actionlint.json +0 -0
  41. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/workflows/matchers/pylint.json +0 -0
  42. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.github/workflows/test.yml +0 -0
  43. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.gitignore +0 -0
  44. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.isort.cfg +0 -0
  45. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.markdownlint-cli2.yaml +0 -0
  46. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.pre-commit-config.yaml +0 -0
  47. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/.pylintrc +0 -0
  48. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/CLAUDE.md +0 -0
  49. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/CONTRIBUTING.md +0 -0
  50. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/LICENSE +0 -0
  51. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/MANIFEST.in +0 -0
  52. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/Makefile +0 -0
  53. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/README.md +0 -0
  54. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/assets/imgs/IL_skills_pipeline.png +0 -0
  55. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/assets/imgs/fig-workflow.png +0 -0
  56. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/assets/imgs/instructlab-banner.png +0 -0
  57. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/assets/imgs/overview.png +0 -0
  58. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/.nojekyll +0 -0
  59. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/README.md +0 -0
  60. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/_coverpage.md +0 -0
  61. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/_navbar.md +0 -0
  62. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/_sidebar.md +0 -0
  63. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/architecture.md +0 -0
  64. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/changelog.md +0 -0
  65. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/configuration.md +0 -0
  66. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/development.md +0 -0
  67. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/examples.md +0 -0
  68. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/index.html +0 -0
  69. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/installation.md +0 -0
  70. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/prompts.md +0 -0
  71. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/quick-start.md +0 -0
  72. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/docs/web-interface.md +0 -0
  73. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/data-generation-with-llama-70b/data-generation-with-llama-70b.ipynb +0 -0
  74. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/data-generation-with-llama-70b/synth_knowledge1.5_llama3.3.yaml +0 -0
  75. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  76. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
  77. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  78. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  79. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  80. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  81. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
  82. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
  83. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/README.md +0 -0
  84. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/assets/customized_nano_quality_results.png +0 -0
  85. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/blocks/blocks.py +0 -0
  86. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge1.5_nemotron_super_49b.yaml +0 -0
  87. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b.yaml +0 -0
  88. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_rewrite_with_diversity.yaml +0 -0
  89. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_summary_diversity.yaml +0 -0
  90. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_summary_diversity_cot.yaml +0 -0
  91. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/generate.py +0 -0
  92. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_answers.yaml +0 -0
  93. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_answers_cot.yaml +0 -0
  94. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_doc_rewrite_inst.yaml +0 -0
  95. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_document_rewrite.yaml +0 -0
  96. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_questions.yaml +0 -0
  97. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_questions_responses.yaml +0 -0
  98. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_summary.yaml +0 -0
  99. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_summary_inst.yaml +0 -0
  100. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg.ipynb +0 -0
  101. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg_data_mixing.ipynb +0 -0
  102. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg_financebench.ipynb +0 -0
  103. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/utils.py +0 -0
  104. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/README.md +0 -0
  105. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/annotation_classification.ipynb +0 -0
  106. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/blocks/__init__.py +0 -0
  107. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/blocks/add_question.py +0 -0
  108. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/blocks/docling_parse_pdf.py +0 -0
  109. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/blocks/json_format.py +0 -0
  110. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/flows/detailed_annotation.yaml +0 -0
  111. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/flows/grounded_summary_extraction.yaml +0 -0
  112. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/flows/simple_annotation.yaml +0 -0
  113. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/flows/unstructured_to_structured.yaml +0 -0
  114. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/prompts/keywords.yaml +0 -0
  115. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/prompts/named_entities.yaml +0 -0
  116. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/prompts/sentiment.yaml +0 -0
  117. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/prompts/summary.yaml +0 -0
  118. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/09b5b62d328d3d0719b6825357fdfb48.pdf +0 -0
  119. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/0d631e444d1c22f0be99a69f5deaff94.pdf +0 -0
  120. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1270f7f67f406b52a2ee86584b452bff.pdf +0 -0
  121. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/14f3d2486b21e639a953afb7ad03d90c.pdf +0 -0
  122. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1689b94530eca82b7758c86b4cf3125f.pdf +0 -0
  123. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/171fd9df333ddd814c764843ed624121.pdf +0 -0
  124. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1949bd0c9c4c23d495d880c4c552bfe1.pdf +0 -0
  125. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/2b626b620ef42f716c6028c74ee4187b.pdf +0 -0
  126. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3877b1983229ec488c6349a188bccf92.pdf +0 -0
  127. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3bc6d3e1c0a117340d288c289bf7f679.pdf +0 -0
  128. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3e714a49937be1672aa48244ba7254ce.pdf +0 -0
  129. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/6064088db0200b32f3f3e848047c5ab6.pdf +0 -0
  130. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/73c60e60043b8775dac929320839a8c6.pdf +0 -0
  131. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/77423f08f0208d476dea73c639f6293a.pdf +0 -0
  132. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/78cf0d3e40caba622d8914916f0f9146.pdf +0 -0
  133. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/7a29e2dcd505f944b16d1e3173cb1c01.pdf +0 -0
  134. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/8c1b4f4af2af2847a240041390e31399.pdf +0 -0
  135. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/8cd753ed00aeee0ed32d03823eef3f7e.pdf +0 -0
  136. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/a24a661c2eb55542903c72391ec09f9b.pdf +0 -0
  137. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b3d7bc295d09d9927e465213612c0192.pdf +0 -0
  138. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b7050f62f52a3d2803beea21404f7af6.pdf +0 -0
  139. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b9b40b0c1e92fb226067bdceacbdab5c.pdf +0 -0
  140. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c20824ea6f927fe380f48a904cf4821b.pdf +0 -0
  141. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c2bad61ce58687fad602549f6048004b.pdf +0 -0
  142. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c47a92e006b54d014a79b447528c55a7.pdf +0 -0
  143. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/da879f8ea1c23aa6565cccaacac271fc.pdf +0 -0
  144. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/e52e6870e8a04339ef969543fc0f0329.pdf +0 -0
  145. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/ecd8e1f1c0fa27dfdd24b358cb65012f.pdf +0 -0
  146. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/f28832481653818f8062a497655fb09e.pdf +0 -0
  147. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/ff898f396d49760343d08575ea773b54.pdf +0 -0
  148. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts.jsonl +0 -0
  149. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/table_manipulation_qna.yaml +0 -0
  150. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/seed_data/unstructured_to_structured_qna.yaml +0 -0
  151. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/structured_summary.ipynb +0 -0
  152. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/table_manipulation.ipynb +0 -0
  153. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/examples/skills_tuning/instructlab/unstructured_to_structured.ipynb +0 -0
  154. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/scripts/__init__.py +0 -0
  155. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/scripts/ruff.sh +0 -0
  156. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/setup.cfg +0 -0
  157. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/__init__.py +0 -0
  158. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/blocks/block.py +0 -0
  159. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/blocks/llmblock.py +0 -0
  160. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/blocks/utilblocks.py +0 -0
  161. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/checkpointer.py +0 -0
  162. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/__init__.py +0 -0
  163. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/annotations/__init__.py +0 -0
  164. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/annotations/cot_reflection.yaml +0 -0
  165. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/annotations/detailed_annotations.yaml +0 -0
  166. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/annotations/detailed_description.yaml +0 -0
  167. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -0
  168. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/__init__.py +0 -0
  169. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/atomic_facts.yaml +0 -0
  170. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -0
  171. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/detailed_summary.yaml +0 -0
  172. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -0
  173. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/evaluate_question.yaml +0 -0
  174. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/extractive_summary.yaml +0 -0
  175. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -0
  176. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/generate_questions.yaml +0 -0
  177. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -0
  178. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/generate_responses.yaml +0 -0
  179. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/mcq_generation.yaml +0 -0
  180. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/router.yaml +0 -0
  181. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -0
  182. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/reasoning/__init__.py +0 -0
  183. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -0
  184. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/__init__.py +0 -0
  185. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/analyzer.yaml +0 -0
  186. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/annotation.yaml +0 -0
  187. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/contexts.yaml +0 -0
  188. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/critic.yaml +0 -0
  189. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -0
  190. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -0
  191. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -0
  192. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -0
  193. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/freeform_questions.yaml +0 -0
  194. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/freeform_responses.yaml +0 -0
  195. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/grounded_questions.yaml +0 -0
  196. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/grounded_responses.yaml +0 -0
  197. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -0
  198. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  199. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/icl_examples/coding.yaml +0 -0
  200. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -0
  201. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -0
  202. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/icl_examples/math.yaml +0 -0
  203. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -0
  204. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -0
  205. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/icl_examples/writing.yaml +0 -0
  206. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/judge.yaml +0 -0
  207. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/planner.yaml +0 -0
  208. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/respond.yaml +0 -0
  209. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/revised_responder.yaml +0 -0
  210. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/router.yaml +0 -0
  211. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -0
  212. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -0
  213. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flows/generation/skills/improve_responses.yaml +0 -0
  214. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -0
  215. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -0
  216. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -0
  217. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/flows/generation/skills/synth_skills.yaml +0 -0
  218. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/logger_config.py +0 -0
  219. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/pipeline.py +0 -0
  220. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/py.typed +0 -0
  221. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/registry.py +0 -0
  222. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/sdg.py +0 -0
  223. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/utils/config_validation.py +0 -0
  224. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/utils/datautils.py +0 -0
  225. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub/utils/validation_result.py +0 -0
  226. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  227. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/src/sdg_hub.egg-info/top_level.txt +0 -0
  228. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/__init__.py +0 -0
  229. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/test_llmblock.py +0 -0
  230. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/testdata/test_config.yaml +0 -0
  231. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/utilblocks/test_combinecolumns.py +0 -0
  232. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -0
  233. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/utilblocks/test_filterblock.py +0 -0
  234. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/utilblocks/test_flattenblock.py +0 -0
  235. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/utilblocks/test_renameblock.py +0 -0
  236. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -0
  237. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/utilblocks/test_selectorblock.py +0 -0
  238. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/blocks/utilblocks/test_settomajority.py +0 -0
  239. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/flows/test_flow.py +0 -0
  240. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/flows/test_flow_column_validation.py +0 -0
  241. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/flows/test_flow_path.py +0 -0
  242. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/flows/test_flow_validation.py +0 -0
  243. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/flows/testdata/test_config_1.yaml +0 -0
  244. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/flows/testdata/test_flow_1.yaml +0 -0
  245. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/flows/testdata/test_flow_2.yaml +0 -0
  246. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/test_checkpointer.py +0 -0
  247. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/test_pipeline.py +0 -0
  248. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/test_sdg.py +0 -0
  249. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tests/utils/test_config_validation.py +0 -0
  250. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/tox.ini +0 -0
  251. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/web_interface/README.md +0 -0
  252. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/web_interface/app.py +0 -0
  253. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/web_interface/static/css/style.css +0 -0
  254. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/web_interface/static/js/app.js +0 -0
  255. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/web_interface/templates/index.html +0 -0
  256. {sdg_hub-0.1.1 → sdg_hub-0.1.3}/web_interface/test_block_types.py +0 -0
@@ -110,7 +110,7 @@ jobs:
110
110
  path: dist
111
111
 
112
112
  - name: "Sigstore sign package"
113
- uses: sigstore/gh-action-sigstore-python@f514d46b907ebcd5bedc05145c03b69c1edd8b46 # v3.0.0
113
+ uses: sigstore/gh-action-sigstore-python@f7ad0af51a5648d09a20d00370f0a91c3bdf8f84 # v3.0.1
114
114
  with:
115
115
  inputs: |
116
116
  ./dist/*.tar.gz
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -36,7 +36,7 @@ Requires-Dist: flask>=3.0.2; extra == "web-interface"
36
36
  Requires-Dist: pyyaml>=6.0.1; extra == "web-interface"
37
37
  Requires-Dist: flask-wtf>=1.2.2; extra == "web-interface"
38
38
  Provides-Extra: vllm
39
- Requires-Dist: vllm<0.8.4,>=0.8.0; extra == "vllm"
39
+ Requires-Dist: vllm>=0.9.1; extra == "vllm"
40
40
  Requires-Dist: torch>=2.0.0; extra == "vllm"
41
41
  Requires-Dist: transformers>=4.37.0; extra == "vllm"
42
42
  Requires-Dist: accelerate>=0.21.0; extra == "vllm"
@@ -22,6 +22,184 @@ Blocks are the fundamental processing units in SDG Hub. Each block performs a sp
22
22
 
23
23
  ## LLM Blocks
24
24
 
25
+ ### OpenAIChatBlock
26
+ - **Registered Name**: `OpenAIChatBlock`
27
+ - **Purpose**: Modern chat completion block using the OpenAI Chat Completions API
28
+ - **Key Features**:
29
+ - Direct OpenAI message format support (system/user/assistant roles)
30
+ - All OpenAI Chat Completions API parameters supported
31
+ - Automatic retry logic for rate limits and API errors
32
+ - Comprehensive structured logging for monitoring
33
+ - Works with any OpenAI-compatible endpoint
34
+
35
+ **Parameters:**
36
+ - `block_name: str` - Name of the block
37
+ - `input_cols: Union[str, List[str]]` - Input column containing messages (must be exactly one)
38
+ - `output_cols: Union[str, List[str]]` - Output column for responses (must be exactly one)
39
+ - `client: openai.OpenAI` - OpenAI client instance
40
+ - `model_id: str` - Model ID to use (e.g., "gpt-4", "gpt-3.5-turbo")
41
+ - **OpenAI API Parameters** (all optional):
42
+ - `frequency_penalty: Optional[float]` - Penalize frequent tokens (-2.0 to 2.0)
43
+ - `logit_bias: Optional[Dict[str, int]]` - Modify likelihood of specified tokens
44
+ - `logprobs: Optional[bool]` - Whether to return log probabilities
45
+ - `max_completion_tokens: Optional[int]` - Maximum tokens in completion
46
+ - `max_tokens: Optional[int]` - Maximum tokens in completion (legacy)
47
+ - `n: Optional[int]` - Number of completions to generate
48
+ - `presence_penalty: Optional[float]` - Penalize repeated tokens (-2.0 to 2.0)
49
+ - `response_format: Optional[Dict[str, Any]]` - Response format (e.g., JSON mode)
50
+ - `seed: Optional[int]` - Seed for deterministic outputs
51
+ - `stop: Optional[Union[str, List[str]]]` - Stop sequences
52
+ - `stream: Optional[bool]` - Whether to stream responses
53
+ - `temperature: Optional[float]` - Sampling temperature (0.0 to 2.0)
54
+ - `tool_choice: Optional[Union[str, Dict[str, Any]]]` - Tool selection strategy
55
+ - `tools: Optional[List[Dict[str, Any]]]` - Available tools for function calling
56
+ - `top_logprobs: Optional[int]` - Number of top log probabilities to return
57
+ - `top_p: Optional[float]` - Nucleus sampling parameter (0.0 to 1.0)
58
+ - `user: Optional[str]` - End-user identifier
59
+ - `extra_body: Optional[dict]` - Additional parameters for custom endpoints
60
+
61
+ **Example Usage:**
62
+ ```yaml
63
+ - block_type: OpenAIChatBlock
64
+ block_config:
65
+ block_name: chat_generator
66
+ input_cols: messages
67
+ output_cols: response
68
+ model_id: gpt-4
69
+ temperature: 0.7
70
+ max_tokens: 500
71
+ ```
72
+
73
+ **Example with Messages Dataset:**
74
+ ```python
75
+ import openai
76
+ from datasets import Dataset
77
+ from sdg_hub.blocks import OpenAIChatBlock
78
+
79
+ # Create client
80
+ client = openai.OpenAI(api_key="your-api-key")
81
+
82
+ # Prepare dataset with messages in OpenAI format
83
+ messages_data = [
84
+ [
85
+ {"role": "system", "content": "You are a helpful assistant."},
86
+ {"role": "user", "content": "Explain quantum computing in simple terms."}
87
+ ],
88
+ [
89
+ {"role": "user", "content": "What is the capital of France?"}
90
+ ]
91
+ ]
92
+ dataset = Dataset.from_dict({"messages": messages_data})
93
+
94
+ # Create and use block
95
+ block = OpenAIChatBlock(
96
+ block_name="qa_generator",
97
+ input_cols="messages",
98
+ output_cols="response",
99
+ client=client,
100
+ model_id="gpt-4",
101
+ temperature=0.7,
102
+ max_tokens=150
103
+ )
104
+
105
+ result = block.generate(dataset)
106
+ print(result["response"])
107
+ ```
108
+
109
+ ### OpenAIAsyncChatBlock
110
+ - **Registered Name**: `OpenAIAsyncChatBlock`
111
+ - **Purpose**: Async version of OpenAIChatBlock for concurrent processing and better performance
112
+ - **Key Features**:
113
+ - Concurrent async requests for improved throughput
114
+ - All features of OpenAIChatBlock
115
+ - Better performance for large batches
116
+ - Automatic concurrency management
117
+
118
+ **Parameters:**
119
+ - Same as `OpenAIChatBlock` except:
120
+ - `async_client: openai.AsyncOpenAI` - Async OpenAI client instance
121
+
122
+ **Example Usage:**
123
+ ```yaml
124
+ - block_type: OpenAIAsyncChatBlock
125
+ block_config:
126
+ block_name: async_chat_generator
127
+ input_cols: messages
128
+ output_cols: response
129
+ model_id: gpt-4
130
+ temperature: 0.7
131
+ max_tokens: 500
132
+ ```
133
+
134
+ **Example with Async Client:**
135
+ ```python
136
+ import asyncio
137
+ import openai
138
+ from datasets import Dataset
139
+ from sdg_hub.blocks import OpenAIAsyncChatBlock
140
+
141
+ # Create async client
142
+ async_client = openai.AsyncOpenAI(api_key="your-api-key")
143
+
144
+ # Same dataset format as sync version
145
+ messages_data = [
146
+ [{"role": "user", "content": f"Generate a creative story about topic {i}"}]
147
+ for i in range(100) # Large batch for demonstration
148
+ ]
149
+ dataset = Dataset.from_dict({"messages": messages_data})
150
+
151
+ # Create and use async block
152
+ block = OpenAIAsyncChatBlock(
153
+ block_name="async_story_generator",
154
+ input_cols="messages",
155
+ output_cols="story",
156
+ async_client=async_client,
157
+ model_id="gpt-4",
158
+ temperature=0.8,
159
+ max_tokens=200
160
+ )
161
+
162
+ # Process large batch concurrently
163
+ result = block.generate(dataset)
164
+ print(f"Generated {len(result)} stories concurrently")
165
+ ```
166
+
167
+ **OpenAI-Compatible Endpoints:**
168
+ Both blocks work with any OpenAI-compatible endpoint:
169
+
170
+ ```python
171
+ # Example with local endpoint
172
+ client = openai.OpenAI(
173
+ api_key="not-needed-for-local",
174
+ base_url="http://localhost:8000/v1"
175
+ )
176
+
177
+ # Example with other providers (Azure, Anthropic, etc.)
178
+ client = openai.OpenAI(
179
+ api_key="your-provider-key",
180
+ base_url="https://your-provider-endpoint.com/v1"
181
+ )
182
+ ```
183
+
184
+ **Monitoring and Logging:**
185
+ Both blocks provide comprehensive structured logging:
186
+ - Initialization logs with model and parameters
187
+ - Generation start/completion logs with batch metrics
188
+ - Effective parameter tracking (including runtime overrides)
189
+ - Error tracking and retry information
190
+
191
+ Log output example:
192
+ ```
193
+ INFO: Initialized OpenAIChatBlock 'chat_generator' with model 'gpt-4'
194
+ {"block_name": "chat_generator", "model_id": "gpt-4", "generation_params": {"temperature": 0.7}}
195
+
196
+ INFO: Starting generation for 10 samples
197
+ {"block_name": "chat_generator", "model_id": "gpt-4", "batch_size": 10, "effective_params": {"temperature": 0.9}}
198
+
199
+ INFO: Generation completed successfully for 10 samples
200
+ {"block_name": "chat_generator", "model_id": "gpt-4", "batch_size": 10}
201
+ ```
202
+
25
203
  ### LLMBlock
26
204
  - **Registered Name**: `LLMBlock`
27
205
  - **Purpose**: Core block for text generation using language models
@@ -1,25 +1,25 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
 
3
3
  # Standard
4
- import json
5
- import random
6
- import uuid
7
- import os
8
- import yaml
9
4
  from pathlib import Path
10
5
  from typing import List
6
+ import json
7
+ import os
8
+ import random
11
9
  import re
10
+ import uuid
12
11
 
13
12
  # Third Party
14
- from datasets import Dataset
13
+ from datasets import Dataset, concatenate_datasets
14
+ from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
15
15
  from tabulate import tabulate
16
16
  from transformers import AutoTokenizer
17
- from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
17
+ import yaml
18
18
 
19
- # Local
20
- import sdg_hub
19
+ # First Party
21
20
  from sdg_hub.logger_config import setup_logger
22
21
  from sdg_hub.utils.datautils import safe_concatenate_datasets
22
+ import sdg_hub
23
23
 
24
24
  logger = setup_logger(__name__)
25
25
  _DEFAULT_CHUNK_OVERLAP = 100
@@ -98,9 +98,70 @@ def _conv_pretrain(rec):
98
98
  return rec
99
99
 
100
100
 
101
def mask_qa_per_doc(ds: Dataset, keep_no_qa_per_doc: int = 3) -> Dataset:
    """
    Flag each QA row as pre-training ("unmask") or fine-tuning material.

    Parameters
    ----------
    ds : Dataset
        Input dataset; must expose a 'document' column. Rows that share the
        same 'document' value are counted together.
    keep_no_qa_per_doc : int, default=3
        Number of rows per document, in order of appearance, that receive
        ``unmask=True`` (pre-training). Later rows for that document receive
        ``unmask=False``.

    Returns
    -------
    Dataset
        The input rows with an added boolean 'unmask' column. Rows flagged
        ``True`` come first, followed by rows flagged ``False``; the relative
        order inside each group follows the input order.
    """
    seen_per_doc = {}
    kept_rows = []
    overflow_rows = []

    for idx, document in enumerate(ds["document"]):
        # Running count of how many rows of this document were seen so far,
        # including the current one.
        occurrence = seen_per_doc.get(document, 0) + 1
        seen_per_doc[document] = occurrence

        row = dict(ds[idx])
        row["unmask"] = occurrence <= keep_no_qa_per_doc
        target = kept_rows if row["unmask"] else overflow_rows
        target.append(row)

    # Unmasked rows are placed ahead of masked rows in the result.
    return concatenate_datasets(
        [Dataset.from_list(kept_rows), Dataset.from_list(overflow_rows)]
    )
140
+
141
+
101
142
  def generate_knowledge_qa_dataset(
102
- generated_dataset: Dataset, keep_context_separate=False, keep_document_outline=False
143
+ generated_dataset: Dataset,
144
+ keep_context_separate: bool = False,
145
+ keep_document_outline: bool = False,
146
+ keep_columns: List[str] = None,
147
+ filter_non_pre_training: bool = True,
148
+ keep_no_qa_per_doc: int = 3,
103
149
  ):
150
+ generated_dataset = generated_dataset.map(
151
+ lambda x: {
152
+ "response": x["response"]
153
+ .replace("[END]", "")
154
+ .replace("[ANSWER]", "")
155
+ .strip()
156
+ },
157
+ num_proc=10,
158
+ )
159
+ generated_dataset = mask_qa_per_doc(
160
+ generated_dataset, keep_no_qa_per_doc=keep_no_qa_per_doc
161
+ )
162
+ if filter_non_pre_training:
163
+ generated_dataset = generated_dataset.filter(lambda x: x["unmask"])
164
+
104
165
  def __create_qa_row(rec):
105
166
  context = rec["document"]
106
167
  instruction = rec["question"]
@@ -146,7 +207,12 @@ def generate_knowledge_qa_dataset(
146
207
  return {"messages": messages, "metadata": metadata, "id": str(uuid.uuid4())}
147
208
 
148
209
  knowledge_ds = generated_dataset.map(
149
- __create_qa_row, remove_columns=generated_dataset.column_names
210
+ __create_qa_row,
211
+ remove_columns=[
212
+ e
213
+ for e in generated_dataset.column_names
214
+ if e not in keep_columns + ["unmask"]
215
+ ],
150
216
  )
151
217
  return knowledge_ds
152
218
 
@@ -55,7 +55,7 @@ web_interface = [
55
55
  "flask-wtf>=1.2.2",
56
56
  ]
57
57
  vllm = [
58
- "vllm>=0.8.0,<0.8.4",
58
+ "vllm>=0.9.1",
59
59
  "torch>=2.0.0",
60
60
  "transformers>=4.37.0",
61
61
  "accelerate>=0.21.0",
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.1.1'
21
- __version_tuple__ = version_tuple = (0, 1, 1)
20
+ __version__ = version = '0.1.3'
21
+ __version_tuple__ = version_tuple = (0, 1, 3)
@@ -6,6 +6,10 @@ This package provides various block implementations for data generation, process
6
6
  # Local
7
7
  from .block import Block
8
8
  from .llmblock import LLMBlock, ConditionalLLMBlock
9
+ from .openaichatblock import (
10
+ OpenAIChatBlock,
11
+ OpenAIAsyncChatBlock
12
+ )
9
13
  from .utilblocks import (
10
14
  SamplePopulatorBlock,
11
15
  SelectorBlock,
@@ -33,4 +37,6 @@ __all__ = [
33
37
  "RenameColumns",
34
38
  "SetToMajorityValue",
35
39
  "BlockRegistry",
40
+ "OpenAIChatBlock",
41
+ "OpenAIAsyncChatBlock"
36
42
  ]