sdg-hub 0.1.0a3__tar.gz → 0.1.0a4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/workflows/actionlint.dockerfile +1 -1
  2. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/workflows/actionlint.yml +1 -3
  3. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/workflows/docs.yml +1 -3
  4. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/workflows/e2e.yml +2 -2
  5. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/workflows/pypi.yaml +5 -5
  6. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/workflows/test.yml +9 -2
  7. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.markdownlint-cli2.yaml +2 -0
  8. sdg_hub-0.1.0a4/CONTRIBUTING.md +23 -0
  9. sdg_hub-0.1.0a4/PKG-INFO +309 -0
  10. sdg_hub-0.1.0a4/README.md +273 -0
  11. sdg_hub-0.1.0a4/examples/instructlab/skills/README.md +142 -0
  12. sdg_hub-0.1.0a4/examples/instructlab/skills/flows/synth_grounded_skills.yaml +80 -0
  13. sdg_hub-0.1.0a4/examples/instructlab/skills/flows/synth_skills.yaml +59 -0
  14. sdg_hub-0.1.0a4/examples/instructlab/skills/mdtable_manipulation.ipynb +372 -0
  15. sdg_hub-0.1.0a4/examples/instructlab/skills/sample_data/mdtable_manipulation_seeds.jsonl +5 -0
  16. sdg_hub-0.1.0a4/examples/instructlab/skills/sample_data/unstructured_to_mdtable_seeds.jsonl +5 -0
  17. sdg_hub-0.1.0a4/examples/instructlab/skills/unstructured_to_mdtable.ipynb +325 -0
  18. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/_version.py +2 -2
  19. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/blocks/llmblock.py +35 -18
  20. sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/contexts.yaml +21 -0
  21. sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/evaluate_freeform_pair.yaml +44 -0
  22. sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/evaluate_freeform_questions.yaml +46 -0
  23. sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/evaluate_grounded_pair.yaml +54 -0
  24. sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
  25. sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/freeform_questions.yaml +29 -0
  26. sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/freeform_responses.yaml +45 -0
  27. sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/grounded_questions.yaml +38 -0
  28. sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/grounded_responses.yaml +59 -0
  29. sdg_hub-0.1.0a4/src/sdg_hub/prompts.py +22 -0
  30. sdg_hub-0.1.0a4/src/sdg_hub.egg-info/PKG-INFO +309 -0
  31. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub.egg-info/SOURCES.txt +19 -3
  32. sdg_hub-0.1.0a4/tests/blocks/test_llmblock.py +343 -0
  33. sdg_hub-0.1.0a4/tests/testdata/test_config.yaml +7 -0
  34. sdg_hub-0.1.0a3/PKG-INFO +0 -154
  35. sdg_hub-0.1.0a3/README.md +0 -118
  36. sdg_hub-0.1.0a3/examples/instructlab/skills/sample_data/mdtable_seeds.jsonl +0 -5
  37. sdg_hub-0.1.0a3/examples/instructlab/skills/unstructed_to_structured.ipynb +0 -423
  38. sdg_hub-0.1.0a3/examples/instructlab/skills/unstructed_to_structured_lls.ipynb +0 -578
  39. sdg_hub-0.1.0a3/src/sdg_hub/prompts.py +0 -17
  40. sdg_hub-0.1.0a3/src/sdg_hub.egg-info/PKG-INFO +0 -154
  41. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/actionlint.yaml +0 -0
  42. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/actions/free-disk-space/action.yml +0 -0
  43. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/dependabot.yml +0 -0
  44. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/mergify.yml +0 -0
  45. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/workflows/lint.yml +0 -0
  46. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/workflows/matchers/actionlint.json +0 -0
  47. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.github/workflows/matchers/pylint.json +0 -0
  48. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.gitignore +0 -0
  49. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.isort.cfg +0 -0
  50. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.pre-commit-config.yaml +0 -0
  51. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/.pylintrc +0 -0
  52. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/LICENSE +0 -0
  53. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/MANIFEST.in +0 -0
  54. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/Makefile +0 -0
  55. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/assets/imgs/IL_skills_pipeline.png +0 -0
  56. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/assets/imgs/customized_nano_closed_book_rag_results.png +0 -0
  57. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/assets/imgs/instructlab-banner.png +0 -0
  58. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/assets/imgs/overview.png +0 -0
  59. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/data-generation-with-llama-70b/data-generation-with-llama-70b.ipynb +0 -0
  60. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/data-generation-with-llama-70b/synth_knowledge1.5_llama3.3.yaml +0 -0
  61. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/inference_time_scaling/prm_with_vllm.ipynb +0 -0
  62. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/instructlab/annotation/sample_data/emotion_classification.jsonl +0 -0
  63. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/instructlab/knowledge/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  64. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/instructlab/knowledge/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  65. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/instructlab/knowledge/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  66. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/instructlab/knowledge/document_collection/ibm-annual-report/qna.yaml +0 -0
  67. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/instructlab/knowledge/document_pre_processing.ipynb +0 -0
  68. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/instructlab/knowledge/knowledge_generation_and_mixing.ipynb +0 -0
  69. {sdg_hub-0.1.0a3/src/sdg_hub/configs/skills → sdg_hub-0.1.0a4/examples/instructlab/skills/configs}/contexts.yaml +0 -0
  70. {sdg_hub-0.1.0a3/src/sdg_hub/configs/skills → sdg_hub-0.1.0a4/examples/instructlab/skills/configs}/evaluate_freeform_pair.yaml +0 -0
  71. {sdg_hub-0.1.0a3/src/sdg_hub/configs/skills → sdg_hub-0.1.0a4/examples/instructlab/skills/configs}/evaluate_freeform_questions.yaml +0 -0
  72. {sdg_hub-0.1.0a3/src/sdg_hub/configs/skills → sdg_hub-0.1.0a4/examples/instructlab/skills/configs}/evaluate_grounded_pair.yaml +0 -0
  73. {sdg_hub-0.1.0a3/src/sdg_hub/configs/skills → sdg_hub-0.1.0a4/examples/instructlab/skills/configs}/evaluate_grounded_questions.yaml +0 -0
  74. {sdg_hub-0.1.0a3/src/sdg_hub/configs/skills → sdg_hub-0.1.0a4/examples/instructlab/skills/configs}/freeform_questions.yaml +0 -0
  75. {sdg_hub-0.1.0a3/src/sdg_hub/configs/skills → sdg_hub-0.1.0a4/examples/instructlab/skills/configs}/freeform_responses.yaml +0 -0
  76. {sdg_hub-0.1.0a3/src/sdg_hub/configs/skills → sdg_hub-0.1.0a4/examples/instructlab/skills/configs}/grounded_questions.yaml +0 -0
  77. {sdg_hub-0.1.0a3/src/sdg_hub/configs/skills → sdg_hub-0.1.0a4/examples/instructlab/skills/configs}/grounded_responses.yaml +0 -0
  78. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/knowledge_generation_using_nemotron/README.md +0 -0
  79. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/knowledge_generation_using_nemotron/flows/synth_knowledge1.5_nemotron_super_49b.yaml +0 -0
  80. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/knowledge_generation_using_nemotron/flows/synth_knowledge_reasoning_nemotron_super_49b.yaml +0 -0
  81. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/knowledge_generation_using_nemotron/generate.py +0 -0
  82. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/knowledge_generation_using_nemotron/knowledge_sdg.ipynb +0 -0
  83. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/knowledge_generation_using_nemotron/prompts/generate_answers.yaml +0 -0
  84. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/knowledge_generation_using_nemotron/prompts/generate_questions.yaml +0 -0
  85. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/examples/knowledge_generation_using_nemotron/prompts/generate_questions_responses.yaml +0 -0
  86. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/pyproject.toml +0 -0
  87. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/requirements-dev.txt +0 -0
  88. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/requirements.txt +0 -0
  89. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/scripts/__init__.py +0 -0
  90. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/scripts/docparser.py +0 -0
  91. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/scripts/docparser_v2.py +0 -0
  92. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/scripts/flow_runner.py +0 -0
  93. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/scripts/ruff.sh +0 -0
  94. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/setup.cfg +0 -0
  95. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/__init__.py +0 -0
  96. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/blocks/__init__.py +0 -0
  97. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/blocks/block.py +0 -0
  98. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/blocks/filterblock.py +0 -0
  99. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/blocks/iterblock.py +0 -0
  100. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/blocks/rmblocks.py +0 -0
  101. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/blocks/utilblocks.py +0 -0
  102. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/__init__.py +0 -0
  103. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/annotations/__init__.py +0 -0
  104. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/annotations/cot_reflection.yaml +0 -0
  105. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/annotations/detailed_description.yaml +0 -0
  106. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -0
  107. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/annotations/simple.yaml +0 -0
  108. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/__init__.py +0 -0
  109. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/atomic_facts.yaml +0 -0
  110. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -0
  111. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/data_recipe/__init__.py +0 -0
  112. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -0
  113. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/detailed_summary.yaml +0 -0
  114. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -0
  115. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/evaluate_question.yaml +0 -0
  116. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -0
  117. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/extractive_summary.yaml +0 -0
  118. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -0
  119. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -0
  120. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/mcq_generation.yaml +0 -0
  121. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/router.yaml +0 -0
  122. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -0
  123. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -0
  124. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/_A_.yaml +0 -0
  125. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/_B_.yaml +0 -0
  126. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/_C_.yaml +0 -0
  127. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/_D_.yaml +0 -0
  128. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/_E_.yaml +0 -0
  129. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/_F_.yaml +0 -0
  130. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/_G_.yaml +0 -0
  131. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/_H_.yaml +0 -0
  132. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/__init__.py +0 -0
  133. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/analyzer.yaml +0 -0
  134. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/annotation.yaml +0 -0
  135. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/critic.yaml +0 -0
  136. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/data_recipe/__init__.py +0 -0
  137. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -0
  138. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/judge.yaml +0 -0
  139. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/planner.yaml +0 -0
  140. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/respond.yaml +0 -0
  141. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/revised_responder.yaml +0 -0
  142. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/router.yaml +0 -0
  143. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -0
  144. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -0
  145. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flow.py +0 -0
  146. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -0
  147. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -0
  148. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/annotation/emotion/simple.yaml +0 -0
  149. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -0
  150. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -0
  151. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -0
  152. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -0
  153. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/generation/skills/agentic_improve_skill.yaml +0 -0
  154. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -0
  155. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -0
  156. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -0
  157. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/flows/generation/skills/synth_skills.yaml +0 -0
  158. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/logger_config.py +0 -0
  159. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/pipeline.py +0 -0
  160. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/py.typed +0 -0
  161. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/registry.py +0 -0
  162. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/sdg.py +0 -0
  163. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/utils/__init__.py +0 -0
  164. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/utils/chunking.py +0 -0
  165. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/utils/datautils.py +0 -0
  166. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/utils/docprocessor.py +0 -0
  167. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub/utils/parse_and_convert.py +0 -0
  168. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  169. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub.egg-info/requires.txt +0 -0
  170. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/src/sdg_hub.egg-info/top_level.txt +0 -0
  171. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/tests/__init__.py +0 -0
  172. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/tests/test_chunking.py +0 -0
  173. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/tests/test_filterblock.py +0 -0
  174. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/tests/test_flow.py +0 -0
  175. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/tests/testdata/test_config_1.yaml +0 -0
  176. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/tests/testdata/test_flow_1.yaml +0 -0
  177. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/tests/testdata/test_flow_2.yaml +0 -0
  178. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/tests/testdata/testdata.py +0 -0
  179. {sdg_hub-0.1.0a3 → sdg_hub-0.1.0a4}/tox.ini +0 -0
@@ -1,3 +1,3 @@
1
1
  # Since dependabot cannot update workflows using docker,
2
2
  # we use this indirection since dependabot can update this file.
3
- FROM rhysd/actionlint:1.7.1@sha256:435ecdb63b1169e80ca3e136290072548c07fc4d76a044cf5541021712f8f344
3
+ FROM rhysd/actionlint:1.7.7@sha256:887a259a5a534f3c4f36cb02dca341673c6089431057242cdc931e9f133147e9
@@ -30,14 +30,12 @@ jobs:
30
30
  runs-on: ubuntu-latest
31
31
  steps:
32
32
  - name: "Harden Runner"
33
- uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
33
+ uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
34
34
  with:
35
35
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
36
36
 
37
37
  - name: "Checkout"
38
38
  uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
39
- with:
40
- fetch-depth: 0
41
39
 
42
40
  - name: "Download actionlint"
43
41
  run: |
@@ -33,13 +33,11 @@ jobs:
33
33
  runs-on: ubuntu-latest
34
34
  steps:
35
35
  - name: "Harden Runner"
36
- uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
36
+ uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
37
37
  with:
38
38
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
39
39
  - name: "Checkout"
40
40
  uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
41
- with:
42
- fetch-depth: 0
43
41
  - name: "Check Markdown documents"
44
42
  uses: DavidAnson/markdownlint-cli2-action@b4c9feab76d8025d1e83c653fa3990936df0e6c8 # v16.0.0
45
43
  with:
@@ -75,7 +75,7 @@ jobs:
75
75
  # config contains DEFAULT_MODEL
76
76
  key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }}
77
77
 
78
- - name: Install instructlab and instructlab-sdg
78
+ - name: Install instructlab and sdg_hub
79
79
  run: |
80
80
  export PATH="/home/runner/.local/bin:/usr/local/cuda/bin:$PATH"
81
81
  python3 -m venv venv
@@ -89,7 +89,7 @@ jobs:
89
89
  # install instructlab
90
90
  python3 -m pip install .
91
91
  cd ..
92
- # Install instructlab-sdg
92
+ # Install sdg_hub
93
93
  python3 -m pip install .
94
94
 
95
95
  - name: Run e2e test
@@ -37,7 +37,7 @@ jobs:
37
37
  runs-on: ubuntu-latest
38
38
  steps:
39
39
  - name: "Harden Runner"
40
- uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
40
+ uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
41
41
  with:
42
42
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
43
43
 
@@ -67,12 +67,12 @@ jobs:
67
67
 
68
68
  steps:
69
69
  - name: "Harden Runner"
70
- uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
70
+ uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
71
71
  with:
72
72
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
73
73
 
74
74
  - name: "Download build artifacts"
75
- uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9
75
+ uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
76
76
  with:
77
77
  name: Packages
78
78
  path: dist
@@ -99,12 +99,12 @@ jobs:
99
99
 
100
100
  steps:
101
101
  - name: "Harden Runner"
102
- uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
102
+ uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
103
103
  with:
104
104
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
105
105
 
106
106
  - name: "Download build artifacts"
107
- uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9
107
+ uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
108
108
  with:
109
109
  name: Packages
110
110
  path: dist
@@ -51,7 +51,7 @@ jobs:
51
51
  platform: "macos-latest"
52
52
  steps:
53
53
  - name: "Harden Runner"
54
- uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
54
+ uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
55
55
  with:
56
56
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
57
57
 
@@ -102,12 +102,19 @@ jobs:
102
102
 
103
103
  - name: Run unit tests with tox
104
104
  run: |
105
- tox
105
+ tox -e py3-unitcov
106
106
 
107
107
  - name: Remove llama-cpp-python from cache
108
108
  if: always()
109
109
  run: |
110
110
  pip cache remove llama_cpp_python
111
+
112
+ - name: Upload coverage to Codecov
113
+ uses: codecov/codecov-action@v4
114
+ with:
115
+ token: ${{ secrets.CODECOV_TOKEN }}
116
+ file: ./coverage.xml
117
+ fail_ci_if_error: true
111
118
 
112
119
  test-workflow-complete:
113
120
  needs: ["test"]
@@ -17,3 +17,5 @@ ignores:
17
17
  - "venv/**"
18
18
  - ".venv/**"
19
19
  - ".tox/**"
20
+ - "examples/**"
21
+ - "!examples/**/README.md"
@@ -0,0 +1,23 @@
1
+ # Contributing to SDG Hub
2
+
3
+ This is a guide for getting started on contributing to SDG Hub.
4
+
5
+ ## Dev Requirements
6
+
7
+ Ensure you have installed the necessary dev dependencies by running `pip install -r requirements-dev.txt` in your dev environment.
8
+
9
+ ## Linting
10
+
11
+ SDG Hub uses a Makefile for linting.
12
+
13
+ - CI changes should pass the Action linter - you can run this via `make actionlint`
14
+
15
+ - Docs changes should pass the Markdown linter - you can run this via `make md-lint`
16
+
17
+ - Code changes should pass the Code linter - you can run this via `make verify`
18
+
19
+ ## Testing
20
+
21
+ SDG Hub uses [tox](https://tox.wiki/) for test automation and [pytest](https://docs.pytest.org/) as a test framework.
22
+
23
+ You can run all tests by simply running the `tox -e py3-unit` command.
@@ -0,0 +1,309 @@
1
+ Metadata-Version: 2.4
2
+ Name: sdg_hub
3
+ Version: 0.1.0a4
4
+ Summary: Synthetic Data Generation
5
+ Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
+ License: Apache-2.0
7
+ Project-URL: homepage, https://ai-innovation.team/
8
+ Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
9
+ Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Environment :: Console
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: MacOS :: MacOS X
15
+ Classifier: Operating System :: POSIX :: Linux
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: Implementation :: CPython
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: click<9.0.0,>=8.1.7
27
+ Requires-Dist: datasets<4.0.0,>=2.18.0
28
+ Requires-Dist: httpx<1.0.0,>=0.25.0
29
+ Requires-Dist: jinja2
30
+ Requires-Dist: langchain-text-splitters
31
+ Requires-Dist: openai<2.0.0,>=1.13.3
32
+ Requires-Dist: rich
33
+ Requires-Dist: tenacity!=8.4.0,>=8.3.0
34
+ Requires-Dist: tqdm<5.0.0,>=4.66.2
35
+ Dynamic: license-file
36
+
37
+ # sdg_hub: Synthetic Data Generation Toolkit for LLMs
38
+
39
+ ![Build](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml/badge.svg?branch=main)
40
+ ![Release](https://img.shields.io/github/v/release/Red-Hat-AI-Innovation-Team/sdg_hub)
41
+ ![License](https://img.shields.io/github/license/Red-Hat-AI-Innovation-Team/sdg_hub)
42
+ [![Tests](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml/badge.svg)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
43
+ [![codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub/graph/badge.svg?token=SP75BCXWO2)](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
44
+
45
+ sdg_hub is a modular, scalable, and efficient solution for creating synthetic data generation workflows in a "no-code" manner. At its core, this framework is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful pipelines for generating data and processing tasks.
46
+
47
+
48
+ ## Installation
49
+
50
+ Latest release from PyPI
51
+
52
+ ```sh
53
+ pip install sdg-hub
54
+ ```
55
+
56
+ Latest main branch
57
+ ```sh
58
+ pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
59
+ ```
60
+
61
+ ## Core Design Principles
62
+
63
+ The framework is built around the following principles:
64
+
65
+ 1. **Modular Design**: Highly composable blocks form the building units of the framework, allowing users to build workflows effortlessly.
66
+ 2. **No-Code Workflow Creation**: Specify workflows using simple YAML configuration files.
67
+ 3. **Scalability and Performance**: Optimized for handling large-scale workflows with millions of records.
68
+
69
+ ---
70
+
71
+ ## Framework Architecture
72
+
73
+ ![overview](assets/imgs/overview.png)
74
+
75
+ ### Blocks: The Fundamental Unit
76
+
77
+ At the heart of the framework is the **Block**. Each block is a self-contained computational unit that performs specific tasks, such as:
78
+
79
+ - Making LLM calls
80
+ - Performing data transformations
81
+ - Applying filters
82
+
83
+ Blocks are designed to be:
84
+ - **Modular**: Reusable across multiple pipelines.
85
+ - **Composable**: Easily chained together to create workflows.
86
+
87
+ These blocks are implemented in the [src/sdg_hub/blocks](src/sdg_hub/blocks) directory.
88
+
89
+ ### Prompts
90
+
91
+ Prompts are at the core of how LLMs are instructed within SDG Hub. Each `LLMBlock` is associated with a prompt configuration file written in YAML, allowing users to define the exact behavior of the language model — including system instructions, generation principles, and output formatting.
92
+
93
+ #### Prompt YAML Structure
94
+
95
+ A typical prompt YAML file looks like this:
96
+
97
+ ```yaml
98
+ system: You are a helpful assistant that can summarize text.
99
+ introduction: Give me a short summary of the text.
100
+ principles:
101
+ - Do not add any new information.
102
+ - Do not miss any key points from the provided text.
103
+ examples:
104
+ - input: Red Hat announced the acquisition of Neural Magic...
105
+ output: Red Hat acquired Neural Magic to enhance its AI optimization capabilities.
106
+ generation: Here is the document to summarize: {{document}}
107
+ ```
108
+
109
+ #### Key Fields
110
+ * `system`: A high-level instruction that sets the persona or behavior of the model.
111
+ * `introduction`: Optional introduction to set context for the user.
112
+ * `principles`: A list of guiding constraints or rules the model should follow during generation.
113
+ * `examples`: Few-shot examples (optional) to guide output format or tone.
114
+ * `generation`: The actual template used to generate the model input. This supports variable injection using `{{variable_name}}`.
115
+
116
+ ### YAML-Based Workflow: The Flow
117
+
118
+ The YAML configuration file, known as the **Flow**, is central to defining data generation workflows in the SDG Framework. A Flow describes how blocks and pipelines are orchestrated to process and generate data efficiently. By leveraging YAML, users can create highly customizable and modular workflows without writing any code.
119
+
120
+ #### Key Features of a Flow
121
+
122
+ 1. **Modular Design**:
123
+ - Flows are composed of blocks, which can be chained together into pipelines.
124
+ - Each block performs a specific task, such as generating, filtering, or transforming data.
125
+
126
+ 2. **Reusability**:
127
+ - Blocks and configurations defined in a Flow can be reused across different workflows.
128
+ - YAML makes it easy to tweak or extend workflows without significant changes.
129
+
130
+ 3. **Ease of Configuration**:
131
+ - Users can specify block types, configurations, and data processing details in a simple and intuitive manner.
132
+
133
+
134
+
135
+ ## Hello World Example
136
+
137
+ Let’s say you have a document and want to generate a concise summary using an LLM. Here’s how simple that is in sdg\_hub:
138
+
139
+ ```yaml
140
+ - block_type: LLMBlock
141
+ block_config:
142
+ block_name: gen_summary
143
+ config_path: prompts/summarization.yaml
144
+ model_id: meta-llama/Llama-3.3-70B-Instruct
145
+ output_cols:
146
+ - summary
147
+ gen_kwargs:
148
+ max_tokens: 512
149
+ ```
150
+
151
+ Want to go further? Add another block to extract keywords from the summary:
152
+
153
+ ```yaml
154
+ - block_type: LLMBlock
155
+ block_config:
156
+ block_name: gen_keywords
157
+ config_path: prompts/keywords.yaml
158
+ model_id: meta-llama/Llama-3.3-70B-Instruct
159
+ output_cols:
160
+ - keywords
161
+ gen_kwargs:
162
+ max_tokens: 64
163
+ ```
164
+
165
+ Just like that, you’ve built a multi-step LLM workflow using nothing but YAML.
166
+
167
+ ## Available Blocks
168
+
169
+ The SDG Framework provides a rich set of blocks for different data processing needs. Here's a comprehensive overview of the available blocks and when to use them:
170
+
171
+ ### Base Block Class
172
+
173
+ The framework is built around the abstract `Block` class, which serves as the foundation for all other blocks:
174
+
175
+ - **Purpose**: Provides core functionality and interface for all blocks
176
+ - **Key Features**:
177
+ - Template validation for input data
178
+ - Configuration loading from YAML files
179
+ - Standardized block initialization
180
+ - Common interface for all blocks
181
+ - **Core Methods**:
182
+ - `_validate`: Validates input data against templates
183
+ - `_load_config`: Loads configuration from YAML files
184
+ - `generate`: Abstract method for block execution
185
+
186
+ All blocks inherit from this base class, ensuring consistent behavior and interface across the framework.
187
+
188
+ ### LLM Blocks
189
+
190
+ 1. **LLMBlock**
191
+ - **Purpose**: Generate text using language models
192
+ - **Use Cases**:
193
+ - Generating questions, responses, or any text content
194
+ - Single-prompt generation with structured outputs
195
+ - **Features**:
196
+ - Supports batched processing
197
+ - Configurable output parsing
198
+ - Template-based prompt generation
199
+
200
+ 2. **ConditionalLLMBlock**
201
+ - **Purpose**: Generate text based on conditional logic
202
+ - **Use Cases**:
203
+ - Different prompt templates based on input conditions
204
+ - Multi-path text generation workflows
205
+ - **Features**:
206
+ - Multiple config paths for different conditions
207
+ - Dynamic prompt selection
208
+
209
+ 3. **LLMLogProbBlock**
210
+ - **Purpose**: Generate text with log probabilities
211
+ - **Use Cases**:
212
+ - Analyzing model confidence
213
+ - Quality scoring of generations
214
+ - **Features**:
215
+ - Returns top-k log probabilities
216
+ - JSON-formatted output
217
+
218
+ 4. **LLMMessagesBlock**
219
+ - **Purpose**: Chat-based text generation
220
+ - **Use Cases**:
221
+ - Multi-turn conversations
222
+ - Chat-based interactions
223
+ - **Features**:
224
+ - Supports message history
225
+ - Chat completion API
226
+
227
+ ### Filtering and Processing Blocks
228
+
229
+ 1. **FilterByValueBlock**
230
+ - **Purpose**: Filter datasets based on column values
231
+ - **Use Cases**:
232
+ - Removing unwanted samples
233
+ - Data cleaning
234
+ - Quality filtering
235
+ - **Features**:
236
+ - Multiple filter operations
237
+ - Type conversion support
238
+ - Parallel processing
239
+
240
+ 2. **IterBlock**
241
+ - **Purpose**: Iterative processing of data
242
+ - **Use Cases**:
243
+ - Multiple generation attempts
244
+ - Iterative refinement
245
+ - **Features**:
246
+ - Configurable number of iterations
247
+ - Nested block execution
248
+
249
+
250
+
251
+ ### Utility Blocks
252
+
253
+ 1. **SamplePopulatorBlock**
254
+ - **Purpose**: Populate samples with configuration data
255
+ - **Use Cases**:
256
+ - Adding metadata
257
+ - Configuration injection
258
+
259
+ 2. **SelectorBlock**
260
+ - **Purpose**: Select data based on a mapping
261
+ - **Use Cases**:
262
+ - Conditional data selection
263
+ - Data routing
264
+
265
+ 3. **CombineColumnsBlock**
266
+ - **Purpose**: Merge multiple columns
267
+ - **Use Cases**:
268
+ - Text concatenation
269
+ - Feature combination
270
+
271
+ 4. **FlattenColumnsBlock**
272
+ - **Purpose**: Convert wide to long format
273
+ - **Use Cases**:
274
+ - Data reshaping
275
+ - Variable-value pairs
276
+
277
+ 5. **DuplicateColumns**
278
+ - **Purpose**: Create column copies
279
+ - **Use Cases**:
280
+ - Data preservation
281
+ - Multiple processing paths
282
+
283
+ 6. **RenameColumns**
284
+ - **Purpose**: Rename dataset columns
285
+ - **Use Cases**:
286
+ - Standardizing column names
287
+ - Data reorganization
288
+
289
+ 7. **SetToMajorityValue**
290
+ - **Purpose**: Replace values with the majority value
291
+ - **Use Cases**:
292
+ - Data normalization
293
+ - Outlier handling
294
+
295
+ ---
296
+ ### Dataflow and Storage
297
+
298
+ - **Data Representation**: Dataflow between blocks and pipelines is handled using **Hugging Face Datasets**, which are based on Arrow tables. This provides:
299
+ - Native parallelization capabilities (e.g., maps, filters).
300
+ - Support for efficient data transformations.
301
+
302
+ - **Data Checkpoints**: Intermediate caches of generated data. Checkpoints allow users to:
303
+ - Resume workflows from the last successful state if interrupted.
304
+ - Improve reliability for long-running workflows.
305
+
306
+
307
+ ## Examples
308
+
309
+ For sample use cases and implementation examples, please refer to the [examples](examples) directory. This directory contains various examples demonstrating different workflows and use cases of the SDG Framework.