sdg-hub 0.1.0a4__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/workflows/docs.yml +1 -1
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/workflows/e2e.yml +1 -1
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/workflows/pypi.yaml +1 -1
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.gitignore +9 -0
- sdg_hub-0.1.1/CLAUDE.md +100 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/CONTRIBUTING.md +8 -1
- sdg_hub-0.1.1/PKG-INFO +190 -0
- sdg_hub-0.1.1/README.md +131 -0
- sdg_hub-0.1.1/assets/imgs/fig-workflow.png +0 -0
- sdg_hub-0.1.1/docs/README.md +51 -0
- sdg_hub-0.1.1/docs/_coverpage.md +11 -0
- sdg_hub-0.1.1/docs/_navbar.md +5 -0
- sdg_hub-0.1.1/docs/_sidebar.md +27 -0
- sdg_hub-0.1.1/docs/architecture.md +149 -0
- sdg_hub-0.1.1/docs/blocks.md +359 -0
- sdg_hub-0.1.1/docs/changelog.md +82 -0
- sdg_hub-0.1.1/docs/configuration.md +201 -0
- sdg_hub-0.1.1/docs/development.md +367 -0
- sdg_hub-0.1.1/docs/examples.md +191 -0
- sdg_hub-0.1.1/docs/index.html +47 -0
- sdg_hub-0.1.1/docs/installation.md +100 -0
- sdg_hub-0.1.1/docs/prompts.md +370 -0
- sdg_hub-0.1.1/docs/quick-start.md +128 -0
- sdg_hub-0.1.1/docs/web-interface.md +230 -0
- {sdg_hub-0.1.0a4/examples → sdg_hub-0.1.1/examples/knowledge_tuning}/data-generation-with-llama-70b/data-generation-with-llama-70b.ipynb +5 -33
- {sdg_hub-0.1.0a4/examples/instructlab/knowledge → sdg_hub-0.1.1/examples/knowledge_tuning/instructlab}/document_pre_processing.ipynb +1 -1
- {sdg_hub-0.1.0a4/examples/instructlab/knowledge → sdg_hub-0.1.1/examples/knowledge_tuning/instructlab}/knowledge_generation_and_mixing.ipynb +6 -2
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/README.md +311 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/assets/customized_nano_quality_results.png +0 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/blocks/blocks.py +60 -0
- {sdg_hub-0.1.0a4/examples/knowledge_generation_using_nemotron → sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model}/flows/synth_knowledge1.5_nemotron_super_49b.yaml +2 -2
- {sdg_hub-0.1.0a4/examples/knowledge_generation_using_nemotron → sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model}/flows/synth_knowledge_reasoning_nemotron_super_49b.yaml +9 -19
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_rewrite_with_diversity.yaml +118 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_summary_diversity.yaml +118 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_summary_diversity_cot.yaml +118 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_answers_cot.yaml +31 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_doc_rewrite_inst.yaml +25 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_document_rewrite.yaml +20 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_summary.yaml +20 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_summary_inst.yaml +24 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg.ipynb +1251 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg_data_mixing.ipynb +471 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg_financebench.ipynb +1078 -0
- sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/utils.py +121 -0
- sdg_hub-0.1.0a4/src/sdg_hub/utils/docprocessor.py → sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_utils.py +306 -20
- {sdg_hub-0.1.0a4/examples/instructlab/skills → sdg_hub-0.1.1/examples/skills_tuning/instructlab}/README.md +24 -48
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/annotation_classification.ipynb +543 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/blocks/__init__.py +3 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/blocks/add_question.py +91 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/blocks/docling_parse_pdf.py +91 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/blocks/json_format.py +133 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/flows/detailed_annotation.yaml +16 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/flows/grounded_summary_extraction.yaml +53 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/flows/simple_annotation.yaml +16 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/flows/unstructured_to_structured.yaml +47 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/prompts/keywords.yaml +29 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/prompts/named_entities.yaml +40 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/prompts/sentiment.yaml +28 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/prompts/summary.yaml +29 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/09b5b62d328d3d0719b6825357fdfb48.pdf +169 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/0d631e444d1c22f0be99a69f5deaff94.pdf +112 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1270f7f67f406b52a2ee86584b452bff.pdf +74 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/14f3d2486b21e639a953afb7ad03d90c.pdf +112 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1689b94530eca82b7758c86b4cf3125f.pdf +112 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/171fd9df333ddd814c764843ed624121.pdf +150 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1949bd0c9c4c23d495d880c4c552bfe1.pdf +131 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/2b626b620ef42f716c6028c74ee4187b.pdf +74 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3877b1983229ec488c6349a188bccf92.pdf +207 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3bc6d3e1c0a117340d288c289bf7f679.pdf +93 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3e714a49937be1672aa48244ba7254ce.pdf +74 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/6064088db0200b32f3f3e848047c5ab6.pdf +74 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/73c60e60043b8775dac929320839a8c6.pdf +93 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/77423f08f0208d476dea73c639f6293a.pdf +169 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/78cf0d3e40caba622d8914916f0f9146.pdf +93 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/7a29e2dcd505f944b16d1e3173cb1c01.pdf +93 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/8c1b4f4af2af2847a240041390e31399.pdf +93 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/8cd753ed00aeee0ed32d03823eef3f7e.pdf +93 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/a24a661c2eb55542903c72391ec09f9b.pdf +112 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b3d7bc295d09d9927e465213612c0192.pdf +150 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b7050f62f52a3d2803beea21404f7af6.pdf +112 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b9b40b0c1e92fb226067bdceacbdab5c.pdf +74 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c20824ea6f927fe380f48a904cf4821b.pdf +93 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c2bad61ce58687fad602549f6048004b.pdf +93 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c47a92e006b54d014a79b447528c55a7.pdf +112 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/da879f8ea1c23aa6565cccaacac271fc.pdf +169 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/e52e6870e8a04339ef969543fc0f0329.pdf +74 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/ecd8e1f1c0fa27dfdd24b358cb65012f.pdf +74 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/f28832481653818f8062a497655fb09e.pdf +74 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/ff898f396d49760343d08575ea773b54.pdf +93 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts.jsonl +30 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/table_manipulation_qna.yaml +97 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/seed_data/unstructured_to_structured_qna.yaml +49 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/structured_summary.ipynb +555 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/table_manipulation.ipynb +1034 -0
- sdg_hub-0.1.1/examples/skills_tuning/instructlab/unstructured_to_structured.ipynb +591 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/pyproject.toml +47 -5
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/_version.py +2 -2
- sdg_hub-0.1.1/src/sdg_hub/blocks/__init__.py +36 -0
- sdg_hub-0.1.1/src/sdg_hub/blocks/block.py +96 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/blocks/llmblock.py +121 -193
- sdg_hub-0.1.1/src/sdg_hub/blocks/utilblocks.py +597 -0
- sdg_hub-0.1.1/src/sdg_hub/checkpointer.py +139 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/skills/contexts.yaml +28 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/skills/evaluate_freeform_pair.yaml +111 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/skills/evaluate_freeform_questions.yaml +78 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/skills/evaluate_grounded_pair.yaml +119 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/skills/freeform_questions.yaml +34 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/skills/freeform_responses.yaml +39 -0
- sdg_hub-0.1.1/src/sdg_hub/configs/skills/router.yaml +59 -0
- sdg_hub-0.1.1/src/sdg_hub/flow.py +474 -0
- sdg_hub-0.1.1/src/sdg_hub/flow_runner.py +216 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +26 -9
- sdg_hub-0.1.0a4/src/sdg_hub/flows/generation/skills/agentic_improve_skill.yaml → sdg_hub-0.1.1/src/sdg_hub/flows/generation/skills/improve_responses.yaml +26 -31
- sdg_hub-0.1.1/src/sdg_hub/pipeline.py +121 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/prompts.py +21 -0
- sdg_hub-0.1.1/src/sdg_hub/sdg.py +206 -0
- sdg_hub-0.1.1/src/sdg_hub/utils/config_validation.py +91 -0
- sdg_hub-0.1.1/src/sdg_hub/utils/validation_result.py +10 -0
- sdg_hub-0.1.1/src/sdg_hub.egg-info/PKG-INFO +190 -0
- sdg_hub-0.1.1/src/sdg_hub.egg-info/SOURCES.txt +245 -0
- sdg_hub-0.1.1/src/sdg_hub.egg-info/requires.txt +36 -0
- sdg_hub-0.1.1/tests/__init__.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/tests/blocks/test_llmblock.py +1 -1
- sdg_hub-0.1.1/tests/blocks/utilblocks/test_combinecolumns.py +166 -0
- sdg_hub-0.1.1/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +112 -0
- sdg_hub-0.1.1/tests/blocks/utilblocks/test_filterblock.py +157 -0
- sdg_hub-0.1.1/tests/blocks/utilblocks/test_flattenblock.py +217 -0
- sdg_hub-0.1.1/tests/blocks/utilblocks/test_renameblock.py +101 -0
- sdg_hub-0.1.1/tests/blocks/utilblocks/test_samplepopulatorblock.py +195 -0
- sdg_hub-0.1.1/tests/blocks/utilblocks/test_selectorblock.py +146 -0
- sdg_hub-0.1.1/tests/blocks/utilblocks/test_settomajority.py +124 -0
- sdg_hub-0.1.1/tests/flows/test_flow.py +257 -0
- sdg_hub-0.1.1/tests/flows/test_flow_column_validation.py +72 -0
- sdg_hub-0.1.1/tests/flows/test_flow_path.py +109 -0
- sdg_hub-0.1.1/tests/flows/test_flow_validation.py +46 -0
- sdg_hub-0.1.1/tests/test_checkpointer.py +175 -0
- sdg_hub-0.1.1/tests/test_flowrunner.py +455 -0
- sdg_hub-0.1.1/tests/test_pipeline.py +201 -0
- sdg_hub-0.1.1/tests/test_sdg.py +413 -0
- sdg_hub-0.1.1/tests/utils/test_config_validation.py +174 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/tox.ini +4 -2
- sdg_hub-0.1.1/web_interface/README.md +133 -0
- sdg_hub-0.1.1/web_interface/app.py +227 -0
- sdg_hub-0.1.1/web_interface/static/css/style.css +171 -0
- sdg_hub-0.1.1/web_interface/static/js/app.js +518 -0
- sdg_hub-0.1.1/web_interface/templates/index.html +119 -0
- sdg_hub-0.1.1/web_interface/test_block_types.py +75 -0
- sdg_hub-0.1.0a4/PKG-INFO +0 -309
- sdg_hub-0.1.0a4/README.md +0 -273
- sdg_hub-0.1.0a4/assets/imgs/customized_nano_closed_book_rag_results.png +0 -0
- sdg_hub-0.1.0a4/examples/inference_time_scaling/prm_with_vllm.ipynb +0 -205
- sdg_hub-0.1.0a4/examples/instructlab/annotation/sample_data/emotion_classification.jsonl +0 -2000
- sdg_hub-0.1.0a4/examples/instructlab/skills/configs/contexts.yaml +0 -21
- sdg_hub-0.1.0a4/examples/instructlab/skills/configs/evaluate_freeform_pair.yaml +0 -44
- sdg_hub-0.1.0a4/examples/instructlab/skills/configs/evaluate_freeform_questions.yaml +0 -46
- sdg_hub-0.1.0a4/examples/instructlab/skills/configs/evaluate_grounded_pair.yaml +0 -54
- sdg_hub-0.1.0a4/examples/instructlab/skills/configs/freeform_questions.yaml +0 -29
- sdg_hub-0.1.0a4/examples/instructlab/skills/configs/freeform_responses.yaml +0 -45
- sdg_hub-0.1.0a4/examples/instructlab/skills/flows/synth_grounded_skills.yaml +0 -80
- sdg_hub-0.1.0a4/examples/instructlab/skills/mdtable_manipulation.ipynb +0 -372
- sdg_hub-0.1.0a4/examples/instructlab/skills/sample_data/mdtable_manipulation_seeds.jsonl +0 -5
- sdg_hub-0.1.0a4/examples/instructlab/skills/sample_data/unstructured_to_mdtable_seeds.jsonl +0 -5
- sdg_hub-0.1.0a4/examples/instructlab/skills/unstructured_to_mdtable.ipynb +0 -325
- sdg_hub-0.1.0a4/examples/knowledge_generation_using_nemotron/README.md +0 -121
- sdg_hub-0.1.0a4/examples/knowledge_generation_using_nemotron/knowledge_sdg.ipynb +0 -962
- sdg_hub-0.1.0a4/requirements-dev.txt +0 -12
- sdg_hub-0.1.0a4/requirements.txt +0 -14
- sdg_hub-0.1.0a4/scripts/flow_runner.py +0 -106
- sdg_hub-0.1.0a4/src/sdg_hub/blocks/__init__.py +0 -6
- sdg_hub-0.1.0a4/src/sdg_hub/blocks/block.py +0 -54
- sdg_hub-0.1.0a4/src/sdg_hub/blocks/filterblock.py +0 -76
- sdg_hub-0.1.0a4/src/sdg_hub/blocks/iterblock.py +0 -31
- sdg_hub-0.1.0a4/src/sdg_hub/blocks/rmblocks.py +0 -194
- sdg_hub-0.1.0a4/src/sdg_hub/blocks/utilblocks.py +0 -140
- sdg_hub-0.1.0a4/src/sdg_hub/configs/annotations/simple.yaml +0 -10
- sdg_hub-0.1.0a4/src/sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/contexts.yaml +0 -21
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -44
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -46
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -54
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/freeform_questions.yaml +0 -29
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/freeform_responses.yaml +0 -45
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/router.yaml +0 -12
- sdg_hub-0.1.0a4/src/sdg_hub/flow.py +0 -144
- sdg_hub-0.1.0a4/src/sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
- sdg_hub-0.1.0a4/src/sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
- sdg_hub-0.1.0a4/src/sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
- sdg_hub-0.1.0a4/src/sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub-0.1.0a4/src/sdg_hub/pipeline.py +0 -66
- sdg_hub-0.1.0a4/src/sdg_hub/sdg.py +0 -164
- sdg_hub-0.1.0a4/src/sdg_hub/utils/chunking.py +0 -73
- sdg_hub-0.1.0a4/src/sdg_hub/utils/parse_and_convert.py +0 -392
- sdg_hub-0.1.0a4/src/sdg_hub.egg-info/PKG-INFO +0 -309
- sdg_hub-0.1.0a4/src/sdg_hub.egg-info/SOURCES.txt +0 -170
- sdg_hub-0.1.0a4/src/sdg_hub.egg-info/requires.txt +0 -9
- sdg_hub-0.1.0a4/tests/test_chunking.py +0 -52
- sdg_hub-0.1.0a4/tests/test_filterblock.py +0 -44
- sdg_hub-0.1.0a4/tests/test_flow.py +0 -74
- sdg_hub-0.1.0a4/tests/testdata/testdata.py +0 -25
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/actionlint.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/actions/free-disk-space/action.yml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/dependabot.yml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/mergify.yml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/workflows/actionlint.dockerfile +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/workflows/actionlint.yml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/workflows/lint.yml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/workflows/matchers/actionlint.json +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/workflows/matchers/pylint.json +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.github/workflows/test.yml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.isort.cfg +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.markdownlint-cli2.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.pre-commit-config.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/.pylintrc +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/LICENSE +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/MANIFEST.in +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/Makefile +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/assets/imgs/IL_skills_pipeline.png +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/assets/imgs/instructlab-banner.png +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/assets/imgs/overview.png +0 -0
- /sdg_hub-0.1.0a4/scripts/__init__.py → /sdg_hub-0.1.1/docs/.nojekyll +0 -0
- {sdg_hub-0.1.0a4/examples → sdg_hub-0.1.1/examples/knowledge_tuning}/data-generation-with-llama-70b/synth_knowledge1.5_llama3.3.yaml +0 -0
- {sdg_hub-0.1.0a4/scripts → sdg_hub-0.1.1/examples/knowledge_tuning/instructlab}/docparser.py +0 -0
- {sdg_hub-0.1.0a4/scripts → sdg_hub-0.1.1/examples/knowledge_tuning/instructlab}/docparser_v2.py +0 -0
- {sdg_hub-0.1.0a4/examples/instructlab/knowledge → sdg_hub-0.1.1/examples/knowledge_tuning/instructlab}/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
- {sdg_hub-0.1.0a4/examples/instructlab/knowledge → sdg_hub-0.1.1/examples/knowledge_tuning/instructlab}/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
- {sdg_hub-0.1.0a4/examples/instructlab/knowledge → sdg_hub-0.1.1/examples/knowledge_tuning/instructlab}/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.1.0a4/examples/instructlab/knowledge → sdg_hub-0.1.1/examples/knowledge_tuning/instructlab}/document_collection/ibm-annual-report/qna.yaml +0 -0
- {sdg_hub-0.1.0a4/examples/knowledge_generation_using_nemotron → sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model}/generate.py +0 -0
- {sdg_hub-0.1.0a4/examples/knowledge_generation_using_nemotron → sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model}/prompts/generate_answers.yaml +0 -0
- {sdg_hub-0.1.0a4/examples/knowledge_generation_using_nemotron → sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model}/prompts/generate_questions.yaml +0 -0
- {sdg_hub-0.1.0a4/examples/knowledge_generation_using_nemotron → sdg_hub-0.1.1/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model}/prompts/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.1.0a4/src/sdg_hub/configs → sdg_hub-0.1.1/scripts}/__init__.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/scripts/ruff.sh +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/setup.cfg +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/__init__.py +0 -0
- {sdg_hub-0.1.0a4/src/sdg_hub/configs/annotations → sdg_hub-0.1.1/src/sdg_hub/configs}/__init__.py +0 -0
- {sdg_hub-0.1.0a4/src/sdg_hub/configs/knowledge → sdg_hub-0.1.1/src/sdg_hub/configs/annotations}/__init__.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/annotations/cot_reflection.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/annotations/detailed_description.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -0
- {sdg_hub-0.1.0a4/src/sdg_hub/configs/knowledge/data_recipe → sdg_hub-0.1.1/src/sdg_hub/configs/knowledge}/__init__.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/evaluate_question.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/mcq_generation.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/router.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -0
- {sdg_hub-0.1.0a4/src/sdg_hub/configs/skills → sdg_hub-0.1.1/src/sdg_hub/configs/reasoning}/__init__.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -0
- {sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/data_recipe → sdg_hub-0.1.1/src/sdg_hub/configs/skills}/__init__.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/skills/analyzer.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/skills/annotation.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/skills/critic.yaml +0 -0
- {sdg_hub-0.1.0a4/examples/instructlab/skills/configs → sdg_hub-0.1.1/src/sdg_hub/configs/skills}/evaluate_grounded_questions.yaml +0 -0
- {sdg_hub-0.1.0a4/examples/instructlab/skills/configs → sdg_hub-0.1.1/src/sdg_hub/configs/skills}/grounded_questions.yaml +0 -0
- {sdg_hub-0.1.0a4/examples/instructlab/skills/configs → sdg_hub-0.1.1/src/sdg_hub/configs/skills}/grounded_responses.yaml +0 -0
- /sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/_G_.yaml → /sdg_hub-0.1.1/src/sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -0
- {sdg_hub-0.1.0a4/tests → sdg_hub-0.1.1/src/sdg_hub/configs/skills/icl_examples}/__init__.py +0 -0
- /sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/_A_.yaml → /sdg_hub-0.1.1/src/sdg_hub/configs/skills/icl_examples/coding.yaml +0 -0
- /sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/_B_.yaml → /sdg_hub-0.1.1/src/sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -0
- /sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/_C_.yaml → /sdg_hub-0.1.1/src/sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -0
- /sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/_D_.yaml → /sdg_hub-0.1.1/src/sdg_hub/configs/skills/icl_examples/math.yaml +0 -0
- /sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/_E_.yaml → /sdg_hub-0.1.1/src/sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -0
- /sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/_F_.yaml → /sdg_hub-0.1.1/src/sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -0
- /sdg_hub-0.1.0a4/src/sdg_hub/configs/skills/_H_.yaml → /sdg_hub-0.1.1/src/sdg_hub/configs/skills/icl_examples/writing.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/skills/judge.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/skills/planner.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/skills/respond.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/skills/revised_responder.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -0
- {sdg_hub-0.1.0a4/examples/instructlab/skills/flows → sdg_hub-0.1.1/src/sdg_hub/flows/generation/skills}/synth_skills.yaml +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/logger_config.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/py.typed +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/registry.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/utils/__init__.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub/utils/datautils.py +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
- {sdg_hub-0.1.0a4 → sdg_hub-0.1.1}/src/sdg_hub.egg-info/top_level.txt +0 -0
- {sdg_hub-0.1.0a4/tests → sdg_hub-0.1.1/tests/blocks}/testdata/test_config.yaml +0 -0
- {sdg_hub-0.1.0a4/tests → sdg_hub-0.1.1/tests/flows}/testdata/test_config_1.yaml +0 -0
- {sdg_hub-0.1.0a4/tests → sdg_hub-0.1.1/tests/flows}/testdata/test_flow_1.yaml +0 -0
- {sdg_hub-0.1.0a4/tests → sdg_hub-0.1.1/tests/flows}/testdata/test_flow_2.yaml +0 -0
@@ -39,6 +39,6 @@ jobs:
|
|
39
39
|
- name: "Checkout"
|
40
40
|
uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
|
41
41
|
- name: "Check Markdown documents"
|
42
|
-
uses: DavidAnson/markdownlint-cli2-action@
|
42
|
+
uses: DavidAnson/markdownlint-cli2-action@992badcdf24e3b8eb7e87ff9287fe931bcb00c6e # v20.0.0
|
43
43
|
with:
|
44
44
|
globs: '**/*.md'
|
@@ -49,7 +49,7 @@ jobs:
|
|
49
49
|
fetch-depth: 0
|
50
50
|
|
51
51
|
- name: "Build and Inspect"
|
52
|
-
uses: hynek/build-and-inspect-python-package@
|
52
|
+
uses: hynek/build-and-inspect-python-package@c52c3a4710070b50470d903818a7b25115dcd076 # v2.13.0
|
53
53
|
|
54
54
|
# push to Test PyPI on
|
55
55
|
# - a new GitHub release is published
|
@@ -117,6 +117,14 @@ ipython_config.py
|
|
117
117
|
.pdm-python
|
118
118
|
.pdm-build/
|
119
119
|
|
120
|
+
# UV
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
122
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
123
|
+
# commonly ignored for libraries.
|
124
|
+
uv.lock
|
125
|
+
.uv_cache/
|
126
|
+
.python-version
|
127
|
+
|
120
128
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
121
129
|
__pypackages__/
|
122
130
|
|
@@ -135,6 +143,7 @@ venv/
|
|
135
143
|
ENV/
|
136
144
|
env.bak/
|
137
145
|
venv.bak/
|
146
|
+
sdg_env/
|
138
147
|
dictionary.dic
|
139
148
|
|
140
149
|
# Spyder project settings
|
sdg_hub-0.1.1/CLAUDE.md
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
# CLAUDE.md
|
2
|
+
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
4
|
+
|
5
|
+
## Overview
|
6
|
+
|
7
|
+
SDG Hub is a modular synthetic data generation toolkit for LLMs. The framework is built around YAML-configured flows that chain computational blocks together to process and generate data.
|
8
|
+
|
9
|
+
## Development Commands
|
10
|
+
|
11
|
+
### Code Style
|
12
|
+
- Use numpy style docstrings
|
13
|
+
- All functions and methods must include python type hints
|
14
|
+
- Write ruff-compliant code
|
15
|
+
|
16
|
+
### Testing
|
17
|
+
- Run all tests: `pytest tests/`
|
18
|
+
- Run specific test: `pytest tests/test_filename.py`
|
19
|
+
- Run tests with coverage: `tox -e py3-unitcov`
|
20
|
+
|
21
|
+
### Linting and Code Quality
|
22
|
+
- Format code: `tox -e ruff -- fix` or `./scripts/ruff.sh fix`
|
23
|
+
- Check code formatting: `tox -e ruff -- check`
|
24
|
+
- Run linting: `tox -e lint` (full pylint) or `tox -e fastlint` (faster)
|
25
|
+
- Type checking: `tox -e mypy`
|
26
|
+
- Run all checks: `make verify` (runs fastlint, mypy, ruff via tox)
|
27
|
+
|
28
|
+
### Build and Install
|
29
|
+
- Install for development: `pip install -e .[dev]`
|
30
|
+
- Install with web interface: `pip install -e .[web_interface]`
|
31
|
+
- Install with examples dependencies: `pip install -e .[examples]`
|
32
|
+
|
33
|
+
### Git Workflow
|
34
|
+
- **IMPORTANT**: Always create a feature branch and never push directly to main
|
35
|
+
- **Use git worktrees for local development**: `git worktree add ../feature-branch-name feature-branch-name`
|
36
|
+
- Create branch: `git checkout -b feature-branch-name`
|
37
|
+
- Push to branch: `git push origin feature-branch-name`
|
38
|
+
|
39
|
+
## Architecture
|
40
|
+
|
41
|
+
### Core Components
|
42
|
+
|
43
|
+
1. **Blocks** (`src/sdg_hub/blocks/`): Fundamental computational units
|
44
|
+
- `Block`: Abstract base class for all blocks
|
45
|
+
- `LLMBlock`: Language model generation blocks
|
46
|
+
- Utility blocks: filtering, data transformation, column operations
|
47
|
+
|
48
|
+
2. **Flows** (`src/sdg_hub/flow.py`): Orchestrates blocks in YAML-defined pipelines
|
49
|
+
- Loads YAML configurations
|
50
|
+
- Manages block execution order
|
51
|
+
- Handles data flow between blocks
|
52
|
+
|
53
|
+
3. **Registry System** (`src/sdg_hub/registry.py`):
|
54
|
+
- `BlockRegistry`: Manages available block types
|
55
|
+
- `PromptRegistry`: Manages prompt configurations
|
56
|
+
|
57
|
+
4. **Prompts** (`src/sdg_hub/configs/`): YAML-based LLM instruction templates
|
58
|
+
- Support Jinja2 templating with variable injection
|
59
|
+
- Include system instructions, principles, examples, and generation templates
|
60
|
+
|
61
|
+
### Data Flow
|
62
|
+
|
63
|
+
- Uses Hugging Face Datasets (Arrow tables) for data representation
|
64
|
+
- Supports checkpointing for long-running flows
|
65
|
+
- Blocks process datasets and pass results to subsequent blocks
|
66
|
+
|
67
|
+
### Flow Configuration
|
68
|
+
|
69
|
+
Flows are defined in YAML files with this structure:
|
70
|
+
```yaml
|
71
|
+
- block_type: LLMBlock
|
72
|
+
block_config:
|
73
|
+
block_name: unique_name
|
74
|
+
config_path: path/to/prompt.yaml
|
75
|
+
model_id: model_name
|
76
|
+
output_cols: [column_names]
|
77
|
+
gen_kwargs:
|
78
|
+
max_tokens: 512
|
79
|
+
```
|
80
|
+
|
81
|
+
### Block Development
|
82
|
+
|
83
|
+
When creating new blocks:
|
84
|
+
1. Inherit from `Block` base class
|
85
|
+
2. Register with `@BlockRegistry.register("BlockName")`
|
86
|
+
3. Implement `generate()` method
|
87
|
+
4. Use `_validate()` for input validation
|
88
|
+
5. Use `_load_config()` for YAML configuration loading
|
89
|
+
|
90
|
+
### Testing Conventions
|
91
|
+
|
92
|
+
- Unit tests in `tests/` directory
|
93
|
+
- Test data in `testdata/` subdirectories
|
94
|
+
- Use pytest fixtures for common test setup
|
95
|
+
- Test both positive and negative cases
|
96
|
+
- Include edge cases and error conditions
|
97
|
+
|
98
|
+
## Additional Tips
|
99
|
+
- Use `rg` in favor of `grep` whenever it's available
|
100
|
+
- Use `uv` for Python environment management: always start with `uv sync --extra dev` to initialize the environment, then run commands through `uv run`
|
@@ -4,7 +4,14 @@ This is a guide for getting started on contributing to SDG Hub.
|
|
4
4
|
|
5
5
|
## Dev Requirements
|
6
6
|
|
7
|
-
|
7
|
+
Install the development dependencies using the optional `dev` group:
|
8
|
+
|
9
|
+
```bash
|
10
|
+
pip install .[dev]
|
11
|
+
```
|
12
|
+
|
13
|
+
If you’re using a fresh virtual environment, this will install both the core and development requirements declared in `pyproject.toml`.
|
14
|
+
|
8
15
|
|
9
16
|
## Linting
|
10
17
|
|
sdg_hub-0.1.1/PKG-INFO
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: sdg_hub
|
3
|
+
Version: 0.1.1
|
4
|
+
Summary: Synthetic Data Generation
|
5
|
+
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
|
+
License: Apache-2.0
|
7
|
+
Project-URL: homepage, https://ai-innovation.team/
|
8
|
+
Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
|
9
|
+
Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
11
|
+
Classifier: Environment :: Console
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
14
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
22
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
23
|
+
Requires-Python: >=3.9
|
24
|
+
Description-Content-Type: text/markdown
|
25
|
+
License-File: LICENSE
|
26
|
+
Requires-Dist: click<9.0.0,>=8.1.7
|
27
|
+
Requires-Dist: datasets<4.0.0,>=2.18.0
|
28
|
+
Requires-Dist: httpx<1.0.0,>=0.25.0
|
29
|
+
Requires-Dist: jinja2
|
30
|
+
Requires-Dist: openai<2.0.0,>=1.13.3
|
31
|
+
Requires-Dist: rich
|
32
|
+
Requires-Dist: tenacity!=8.4.0,>=8.3.0
|
33
|
+
Requires-Dist: tqdm<5.0.0,>=4.66.2
|
34
|
+
Provides-Extra: web-interface
|
35
|
+
Requires-Dist: flask>=3.0.2; extra == "web-interface"
|
36
|
+
Requires-Dist: pyyaml>=6.0.1; extra == "web-interface"
|
37
|
+
Requires-Dist: flask-wtf>=1.2.2; extra == "web-interface"
|
38
|
+
Provides-Extra: vllm
|
39
|
+
Requires-Dist: vllm<0.8.4,>=0.8.0; extra == "vllm"
|
40
|
+
Requires-Dist: torch>=2.0.0; extra == "vllm"
|
41
|
+
Requires-Dist: transformers>=4.37.0; extra == "vllm"
|
42
|
+
Requires-Dist: accelerate>=0.21.0; extra == "vllm"
|
43
|
+
Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
|
44
|
+
Provides-Extra: examples
|
45
|
+
Requires-Dist: tabulate>=0.9.0; extra == "examples"
|
46
|
+
Requires-Dist: transformers>=4.37.0; extra == "examples"
|
47
|
+
Requires-Dist: langchain-text-splitters; extra == "examples"
|
48
|
+
Requires-Dist: docling>=2.3.0; extra == "examples"
|
49
|
+
Provides-Extra: dev
|
50
|
+
Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
|
51
|
+
Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
|
52
|
+
Requires-Dist: pylint-pydantic; extra == "dev"
|
53
|
+
Requires-Dist: pytest; extra == "dev"
|
54
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
55
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
56
|
+
Requires-Dist: pytest-html; extra == "dev"
|
57
|
+
Requires-Dist: tox<5,>=4.4.2; extra == "dev"
|
58
|
+
Dynamic: license-file
|
59
|
+
|
60
|
+
# SDG Hub: Synthetic Data Generation Toolkit
|
61
|
+
|
62
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
|
63
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
|
64
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
|
65
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
|
66
|
+
[](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
|
67
|
+
|
68
|
+
<html>
|
69
|
+
<h3 align="center">
|
70
|
+
A modular, scalable, and efficient solution for creating synthetic data generation flows in a "low-code" manner.
|
71
|
+
</h3>
|
72
|
+
<h3 align="center">
|
73
|
+
<a href="https://ai-innovation.team/sdg_hub">Documentation</a> |
|
74
|
+
<a href="examples/">Examples</a> |
|
75
|
+
<a href="https://www.youtube.com/watch?v=aGKCViWjAmA">Video Tutorial</a>
|
76
|
+
</h3>
|
77
|
+
</html>
|
78
|
+
|
79
|
+
SDG Hub is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful flows for generating data and processing tasks. Define complex workflows using nothing but YAML configuration files.
|
80
|
+
|
81
|
+
**📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
|
82
|
+
|
83
|
+
---
|
84
|
+
|
85
|
+
## ✨ Key Features
|
86
|
+
|
87
|
+
- **Low-Code Flow Creation**: Build sophisticated data generation pipelines using
|
88
|
+
simple YAML configuration files without writing any code.
|
89
|
+
|
90
|
+
- **Modular Block System**: Compose workflows from reusable, self-contained
|
91
|
+
blocks that handle LLM calls, data transformations, and filtering.
|
92
|
+
|
93
|
+
- **LLM-Agnostic**: Works with any language model through configurable
|
94
|
+
prompt templates and generation parameters.
|
95
|
+
|
96
|
+
- **Prompt Engineering Friendly**: Tune LLM behavior by editing declarative YAML prompts.
|
97
|
+
|
98
|
+
## 🚀 Installation
|
99
|
+
|
100
|
+
### Stable Release (Recommended)
|
101
|
+
|
102
|
+
```bash
|
103
|
+
pip install sdg-hub
|
104
|
+
```
|
105
|
+
|
106
|
+
### Development Version
|
107
|
+
|
108
|
+
```bash
|
109
|
+
pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
|
110
|
+
```
|
111
|
+
|
112
|
+
## 🏁 Quick Start
|
113
|
+
|
114
|
+
### Prerequisites
|
115
|
+
|
116
|
+
Before getting started, make sure you have:
|
117
|
+
- Python 3.9 or higher
|
118
|
+
- LLM Inference Endpoint exposed through OpenAI API
|
119
|
+
|
120
|
+
### Simple Example
|
121
|
+
|
122
|
+
Here's the simplest way to get started:
|
123
|
+
|
124
|
+
```python
|
125
|
+
from sdg_hub.flow_runner import run_flow
|
126
|
+
|
127
|
+
# Run a basic knowledge generation flow
|
128
|
+
run_flow(
|
129
|
+
ds_path="my_data.jsonl",
|
130
|
+
save_path="output.jsonl",
|
131
|
+
endpoint="http://0.0.0.0:8000/v1",
|
132
|
+
flow_path="flows/generation/knowledge/synth_knowledge.yaml"
|
133
|
+
)
|
134
|
+
```
|
135
|
+
|
136
|
+
### Advanced Configuration
|
137
|
+
You can invoke any built-in flow using `run_flow`:
|
138
|
+
```python
|
139
|
+
from sdg_hub.flow_runner import run_flow
|
140
|
+
|
141
|
+
run_flow(
|
142
|
+
ds_path="path/to/dataset.jsonl",
|
143
|
+
save_path="path/to/output.jsonl",
|
144
|
+
endpoint="http://0.0.0.0:8000/v1",
|
145
|
+
flow_path="path/to/flow.yaml",
|
146
|
+
checkpoint_dir="path/to/checkpoints",
|
147
|
+
batch_size=8,
|
148
|
+
num_workers=32,
|
149
|
+
save_freq=2,
|
150
|
+
)
|
151
|
+
```
|
152
|
+
|
153
|
+
### 📂 Available Built-in Flows
|
154
|
+
|
155
|
+
You can start with any of these YAML flows out of the box:
|
156
|
+
|
157
|
+
#### 🔎 **Knowledge Flows**
|
158
|
+
|
159
|
+
| Flow | Description |
|
160
|
+
|------|-------------|
|
161
|
+
| `synth_knowledge.yaml` | Produces document-grounded questions and answers for factual memorization |
|
162
|
+
| `synth_knowledge1.5.yaml` | Improved version that builds intermediate representations for better recall |
|
163
|
+
|
164
|
+
#### 🧠 **Skills Flows**
|
165
|
+
|
166
|
+
| Flow | Description |
|
167
|
+
|------|-------------|
|
168
|
+
| `synth_skills.yaml` | Freeform skills QA generation (e.g., "Create a new GitHub issue to add type hints") |
|
169
|
+
| `synth_grounded_skills.yaml` | Domain-specific skill generation (e.g., "From the given conversation, create a table for feature requests") |
|
170
|
+
| `improve_responses.yaml` | Uses planning and critique-based refinement to improve generated answers |
|
171
|
+
|
172
|
+
All these can be found here: [flows](src/sdg_hub/flows)
|
173
|
+
|
174
|
+
## 📺 Video Tutorial
|
175
|
+
|
176
|
+
For a comprehensive walkthrough of sdg_hub:
|
177
|
+
|
178
|
+
[](https://www.youtube.com/watch?v=aGKCViWjAmA)
|
179
|
+
|
180
|
+
## 🤝 Contributing
|
181
|
+
|
182
|
+
We welcome contributions from the community! Whether it's bug reports, feature requests, documentation improvements, or code contributions, please check out our [contribution guidelines](CONTRIBUTING.md).
|
183
|
+
|
184
|
+
## 📄 License
|
185
|
+
|
186
|
+
This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
|
187
|
+
|
188
|
+
---
|
189
|
+
|
190
|
+
Built with ❤️ by the Red Hat AI Innovation Team
|
sdg_hub-0.1.1/README.md
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
# SDG Hub: Synthetic Data Generation Toolkit
|
2
|
+
|
3
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
|
4
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
|
5
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
|
6
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
|
7
|
+
[](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
|
8
|
+
|
9
|
+
<html>
|
10
|
+
<h3 align="center">
|
11
|
+
A modular, scalable, and efficient solution for creating synthetic data generation flows in a "low-code" manner.
|
12
|
+
</h3>
|
13
|
+
<h3 align="center">
|
14
|
+
<a href="https://ai-innovation.team/sdg_hub">Documentation</a> |
|
15
|
+
<a href="examples/">Examples</a> |
|
16
|
+
<a href="https://www.youtube.com/watch?v=aGKCViWjAmA">Video Tutorial</a>
|
17
|
+
</h3>
|
18
|
+
</html>
|
19
|
+
|
20
|
+
SDG Hub is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful flows for generating data and processing tasks. Define complex workflows using nothing but YAML configuration files.
|
21
|
+
|
22
|
+
**📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
|
23
|
+
|
24
|
+
---
|
25
|
+
|
26
|
+
## ✨ Key Features
|
27
|
+
|
28
|
+
- **Low-Code Flow Creation**: Build sophisticated data generation pipelines using
|
29
|
+
simple YAML configuration files without writing any code.
|
30
|
+
|
31
|
+
- **Modular Block System**: Compose workflows from reusable, self-contained
|
32
|
+
blocks that handle LLM calls, data transformations, and filtering.
|
33
|
+
|
34
|
+
- **LLM-Agnostic**: Works with any language model through configurable
|
35
|
+
prompt templates and generation parameters.
|
36
|
+
|
37
|
+
- **Prompt Engineering Friendly**: Tune LLM behavior by editing declarative YAML prompts.
|
38
|
+
|
39
|
+
## 🚀 Installation
|
40
|
+
|
41
|
+
### Stable Release (Recommended)
|
42
|
+
|
43
|
+
```bash
|
44
|
+
pip install sdg-hub
|
45
|
+
```
|
46
|
+
|
47
|
+
### Development Version
|
48
|
+
|
49
|
+
```bash
|
50
|
+
pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
|
51
|
+
```
|
52
|
+
|
53
|
+
## 🏁 Quick Start
|
54
|
+
|
55
|
+
### Prerequisites
|
56
|
+
|
57
|
+
Before getting started, make sure you have:
|
58
|
+
- Python 3.9 or higher
|
59
|
+
- LLM Inference Endpoint exposed through OpenAI API
|
60
|
+
|
61
|
+
### Simple Example
|
62
|
+
|
63
|
+
Here's the simplest way to get started:
|
64
|
+
|
65
|
+
```python
|
66
|
+
from sdg_hub.flow_runner import run_flow
|
67
|
+
|
68
|
+
# Run a basic knowledge generation flow
|
69
|
+
run_flow(
|
70
|
+
ds_path="my_data.jsonl",
|
71
|
+
save_path="output.jsonl",
|
72
|
+
endpoint="http://0.0.0.0:8000/v1",
|
73
|
+
flow_path="flows/generation/knowledge/synth_knowledge.yaml"
|
74
|
+
)
|
75
|
+
```
|
76
|
+
|
77
|
+
### Advanced Configuration
|
78
|
+
You can invoke any built-in flow using `run_flow`:
|
79
|
+
```python
|
80
|
+
from sdg_hub.flow_runner import run_flow
|
81
|
+
|
82
|
+
run_flow(
|
83
|
+
ds_path="path/to/dataset.jsonl",
|
84
|
+
save_path="path/to/output.jsonl",
|
85
|
+
endpoint="http://0.0.0.0:8000/v1",
|
86
|
+
flow_path="path/to/flow.yaml",
|
87
|
+
checkpoint_dir="path/to/checkpoints",
|
88
|
+
batch_size=8,
|
89
|
+
num_workers=32,
|
90
|
+
save_freq=2,
|
91
|
+
)
|
92
|
+
```
|
93
|
+
|
94
|
+
### 📂 Available Built-in Flows
|
95
|
+
|
96
|
+
You can start with any of these YAML flows out of the box:
|
97
|
+
|
98
|
+
#### 🔎 **Knowledge Flows**
|
99
|
+
|
100
|
+
| Flow | Description |
|
101
|
+
|------|-------------|
|
102
|
+
| `synth_knowledge.yaml` | Produces document-grounded questions and answers for factual memorization |
|
103
|
+
| `synth_knowledge1.5.yaml` | Improved version that builds intermediate representations for better recall |
|
104
|
+
|
105
|
+
#### 🧠 **Skills Flows**
|
106
|
+
|
107
|
+
| Flow | Description |
|
108
|
+
|------|-------------|
|
109
|
+
| `synth_skills.yaml` | Freeform skills QA generation (e.g., "Create a new GitHub issue to add type hints") |
|
110
|
+
| `synth_grounded_skills.yaml` | Domain-specific skill generation (e.g., "From the given conversation, create a table for feature requests") |
|
111
|
+
| `improve_responses.yaml` | Uses planning and critique-based refinement to improve generated answers |
|
112
|
+
|
113
|
+
All these can be found here: [flows](src/sdg_hub/flows)
|
114
|
+
|
115
|
+
## 📺 Video Tutorial
|
116
|
+
|
117
|
+
For a comprehensive walkthrough of sdg_hub:
|
118
|
+
|
119
|
+
[](https://www.youtube.com/watch?v=aGKCViWjAmA)
|
120
|
+
|
121
|
+
## 🤝 Contributing
|
122
|
+
|
123
|
+
We welcome contributions from the community! Whether it's bug reports, feature requests, documentation improvements, or code contributions, please check out our [contribution guidelines](CONTRIBUTING.md).
|
124
|
+
|
125
|
+
## 📄 License
|
126
|
+
|
127
|
+
This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
|
128
|
+
|
129
|
+
---
|
130
|
+
|
131
|
+
Built with ❤️ by the Red Hat AI Innovation Team
|
Binary file
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# SDG Hub: Synthetic Data Generation Toolkit
|
2
|
+
|
3
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
|
4
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
|
5
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
|
6
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
|
7
|
+
[](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
|
8
|
+
|
9
|
+
## Overview
|
10
|
+
|
11
|
+
SDG Hub is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful flows for generating data and processing tasks. Define complex workflows using nothing but YAML configuration files.
|
12
|
+
|
13
|
+
## ✨ Key Features
|
14
|
+
|
15
|
+
- **Low-Code Flow Creation**: Build sophisticated data generation pipelines using simple YAML configuration files without writing any code.
|
16
|
+
|
17
|
+
- **Modular Block System**: Compose workflows from reusable, self-contained blocks that handle LLM calls, data transformations, and filtering.
|
18
|
+
|
19
|
+
- **LLM-Agnostic**: Works with any language model through configurable prompt templates and generation parameters.
|
20
|
+
|
21
|
+
- **Prompt Engineering Friendly**: Tune LLM behavior by editing declarative YAML prompts.
|
22
|
+
|
23
|
+
## 🚀 Installation
|
24
|
+
|
25
|
+
### Stable Release (Recommended)
|
26
|
+
|
27
|
+
```bash
|
28
|
+
pip install sdg-hub
|
29
|
+
```
|
30
|
+
|
31
|
+
### Development Version
|
32
|
+
|
33
|
+
```bash
|
34
|
+
pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
|
35
|
+
```
|
36
|
+
|
37
|
+
## 📺 Video Tutorial
|
38
|
+
|
39
|
+
For a comprehensive walkthrough of sdg_hub:
|
40
|
+
|
41
|
+
[](https://www.youtube.com/watch?v=aGKCViWjAmA)
|
42
|
+
|
43
|
+
## 🤝 Contributing
|
44
|
+
|
45
|
+
We welcome contributions from the community! Whether it's bug reports, feature requests, documentation improvements, or code contributions, please check out our [development guide](development.md).
|
46
|
+
|
47
|
+
## 📄 License
|
48
|
+
|
49
|
+
This project is licensed under the Apache 2.0 License - see the [LICENSE](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE) file for details.
|
50
|
+
|
51
|
+
---
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# SDG Hub
|
2
|
+
|
3
|
+
> A modular, scalable, and efficient solution for creating synthetic data generation flows in a "low-code" manner
|
4
|
+
|
5
|
+
- 🧩 **Modular Block System** - Compose workflows from reusable, self-contained blocks
|
6
|
+
- 🚀 **Low-Code Flow Creation** - Build sophisticated pipelines using simple YAML configuration files
|
7
|
+
- 🤖 **LLM-Agnostic** - Works with any language model through configurable prompt templates
|
8
|
+
- ⚙️ **Prompt Engineering Friendly** - Tune LLM behavior by editing declarative YAML prompts
|
9
|
+
|
10
|
+
[GitHub](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub)
|
11
|
+
[Get Started](quick-start.md)
|
@@ -0,0 +1,27 @@
|
|
1
|
+
- [Home](/)
|
2
|
+
- [Installation](installation.md)
|
3
|
+
- [Quick Start Guide](quick-start.md)
|
4
|
+
- [Configuration](configuration.md)
|
5
|
+
- [Logging](configuration.md#logging-configuration)
|
6
|
+
- [Environment Variables](configuration.md#environment-variables)
|
7
|
+
- [Flow Runner](configuration.md#flow-runner-configuration)
|
8
|
+
- [Architecture](architecture.md)
|
9
|
+
- [Core Components](architecture.md#core-components)
|
10
|
+
- [Data Flow](architecture.md#data-flow)
|
11
|
+
- [Flow Configuration](architecture.md#flow-configuration)
|
12
|
+
- [Blocks](blocks.md)
|
13
|
+
- [Available Blocks](blocks.md#available-blocks)
|
14
|
+
- [LLM Blocks](blocks.md#llm-blocks)
|
15
|
+
- [Utility Blocks](blocks.md#utility-blocks)
|
16
|
+
- [Prompts](prompts.md)
|
17
|
+
- [Configuration](prompts.md#configuration)
|
18
|
+
- [Templates](prompts.md#templates)
|
19
|
+
- [Examples](examples.md)
|
20
|
+
- [Knowledge Tuning](examples.md#knowledge-tuning)
|
21
|
+
- [Skills Tuning](examples.md#skills-tuning)
|
22
|
+
- [Web Interface](web-interface.md)
|
23
|
+
- [Development](development.md)
|
24
|
+
- [Contributing](development.md#contributing)
|
25
|
+
- [Testing](development.md#testing)
|
26
|
+
- [Commands](development.md#commands)
|
27
|
+
- [Changelog](changelog.md)
|