sdg-hub 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/workflows/pypi.yaml +1 -1
- {sdg_hub-0.1.1/src/sdg_hub.egg-info → sdg_hub-0.1.2}/PKG-INFO +1 -1
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/blocks.md +178 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_utils.py +77 -11
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/_version.py +2 -2
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/blocks/__init__.py +6 -0
- sdg_hub-0.1.2/src/sdg_hub/blocks/openaichatblock.py +556 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flow.py +21 -18
- sdg_hub-0.1.2/src/sdg_hub/flow_runner.py +437 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -5
- sdg_hub-0.1.2/src/sdg_hub/prompts.py +74 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/utils/__init__.py +5 -0
- sdg_hub-0.1.2/src/sdg_hub/utils/error_handling.py +94 -0
- sdg_hub-0.1.2/src/sdg_hub/utils/path_resolution.py +62 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2/src/sdg_hub.egg-info}/PKG-INFO +1 -1
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub.egg-info/SOURCES.txt +6 -0
- sdg_hub-0.1.2/tests/blocks/test_openaichatblock.py +647 -0
- sdg_hub-0.1.2/tests/test_flowrunner.py +899 -0
- sdg_hub-0.1.2/tests/utils/test_error_handling.py +242 -0
- sdg_hub-0.1.2/tests/utils/test_path_resolution.py +223 -0
- sdg_hub-0.1.1/src/sdg_hub/flow_runner.py +0 -216
- sdg_hub-0.1.1/src/sdg_hub/prompts.py +0 -43
- sdg_hub-0.1.1/tests/test_flowrunner.py +0 -455
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/actionlint.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/actions/free-disk-space/action.yml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/dependabot.yml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/mergify.yml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/workflows/actionlint.dockerfile +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/workflows/actionlint.yml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/workflows/docs.yml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/workflows/e2e.yml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/workflows/lint.yml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/workflows/matchers/actionlint.json +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/workflows/matchers/pylint.json +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.github/workflows/test.yml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.gitignore +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.isort.cfg +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.markdownlint-cli2.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.pre-commit-config.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/.pylintrc +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/CLAUDE.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/CONTRIBUTING.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/LICENSE +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/MANIFEST.in +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/Makefile +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/README.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/assets/imgs/IL_skills_pipeline.png +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/assets/imgs/fig-workflow.png +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/assets/imgs/instructlab-banner.png +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/assets/imgs/overview.png +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/.nojekyll +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/README.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/_coverpage.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/_navbar.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/_sidebar.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/architecture.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/changelog.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/configuration.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/development.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/examples.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/index.html +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/installation.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/prompts.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/quick-start.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/docs/web-interface.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/data-generation-with-llama-70b/data-generation-with-llama-70b.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/data-generation-with-llama-70b/synth_knowledge1.5_llama3.3.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/README.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/assets/customized_nano_quality_results.png +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/blocks/blocks.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge1.5_nemotron_super_49b.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_rewrite_with_diversity.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_summary_diversity.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/flows/synth_knowledge_reasoning_nemotron_super_49b_summary_diversity_cot.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/generate.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_answers.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_answers_cot.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_doc_rewrite_inst.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_document_rewrite.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_questions.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_summary.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/prompts/generate_summary_inst.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg_data_mixing.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/reasoning_sdg_financebench.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/knowledge_tuning/knowledge_tuning_with_reasoning_model/utils.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/README.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/annotation_classification.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/blocks/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/blocks/add_question.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/blocks/docling_parse_pdf.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/blocks/json_format.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/flows/detailed_annotation.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/flows/grounded_summary_extraction.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/flows/simple_annotation.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/flows/unstructured_to_structured.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/prompts/keywords.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/prompts/named_entities.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/prompts/sentiment.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/prompts/summary.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/09b5b62d328d3d0719b6825357fdfb48.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/0d631e444d1c22f0be99a69f5deaff94.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1270f7f67f406b52a2ee86584b452bff.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/14f3d2486b21e639a953afb7ad03d90c.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1689b94530eca82b7758c86b4cf3125f.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/171fd9df333ddd814c764843ed624121.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/1949bd0c9c4c23d495d880c4c552bfe1.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/2b626b620ef42f716c6028c74ee4187b.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3877b1983229ec488c6349a188bccf92.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3bc6d3e1c0a117340d288c289bf7f679.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/3e714a49937be1672aa48244ba7254ce.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/6064088db0200b32f3f3e848047c5ab6.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/73c60e60043b8775dac929320839a8c6.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/77423f08f0208d476dea73c639f6293a.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/78cf0d3e40caba622d8914916f0f9146.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/7a29e2dcd505f944b16d1e3173cb1c01.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/8c1b4f4af2af2847a240041390e31399.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/8cd753ed00aeee0ed32d03823eef3f7e.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/a24a661c2eb55542903c72391ec09f9b.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b3d7bc295d09d9927e465213612c0192.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b7050f62f52a3d2803beea21404f7af6.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/b9b40b0c1e92fb226067bdceacbdab5c.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c20824ea6f927fe380f48a904cf4821b.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c2bad61ce58687fad602549f6048004b.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/c47a92e006b54d014a79b447528c55a7.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/da879f8ea1c23aa6565cccaacac271fc.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/e52e6870e8a04339ef969543fc0f0329.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/ecd8e1f1c0fa27dfdd24b358cb65012f.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/f28832481653818f8062a497655fb09e.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts/ff898f396d49760343d08575ea773b54.pdf +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/financial_call_transcripts.jsonl +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/table_manipulation_qna.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/seed_data/unstructured_to_structured_qna.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/structured_summary.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/table_manipulation.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/examples/skills_tuning/instructlab/unstructured_to_structured.ipynb +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/pyproject.toml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/scripts/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/scripts/ruff.sh +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/setup.cfg +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/blocks/block.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/blocks/llmblock.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/blocks/utilblocks.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/checkpointer.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/annotations/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/annotations/cot_reflection.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/annotations/detailed_annotations.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/annotations/detailed_description.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/annotations/simple_annotations.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/atomic_facts.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/detailed_summary.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/evaluate_question.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/extractive_summary.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/generate_questions.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/generate_responses.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/mcq_generation.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/router.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/reasoning/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/analyzer.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/annotation.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/contexts.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/critic.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/freeform_questions.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/freeform_responses.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/grounded_questions.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/grounded_responses.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/icl_examples/coding.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/icl_examples/math.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/icl_examples/writing.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/judge.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/planner.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/respond.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/revised_responder.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/router.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flows/generation/skills/improve_responses.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/flows/generation/skills/synth_skills.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/logger_config.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/pipeline.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/py.typed +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/registry.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/sdg.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/utils/config_validation.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/utils/datautils.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub/utils/validation_result.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub.egg-info/requires.txt +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/src/sdg_hub.egg-info/top_level.txt +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/__init__.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/test_llmblock.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/testdata/test_config.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/utilblocks/test_combinecolumns.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/utilblocks/test_filterblock.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/utilblocks/test_flattenblock.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/utilblocks/test_renameblock.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/utilblocks/test_selectorblock.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/blocks/utilblocks/test_settomajority.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/flows/test_flow.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/flows/test_flow_column_validation.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/flows/test_flow_path.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/flows/test_flow_validation.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/flows/testdata/test_config_1.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/flows/testdata/test_flow_1.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/flows/testdata/test_flow_2.yaml +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/test_checkpointer.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/test_pipeline.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/test_sdg.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tests/utils/test_config_validation.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/tox.ini +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/web_interface/README.md +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/web_interface/app.py +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/web_interface/static/css/style.css +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/web_interface/static/js/app.js +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/web_interface/templates/index.html +0 -0
- {sdg_hub-0.1.1 → sdg_hub-0.1.2}/web_interface/test_block_types.py +0 -0
@@ -110,7 +110,7 @@ jobs:
|
|
110
110
|
path: dist
|
111
111
|
|
112
112
|
- name: "Sigstore sign package"
|
113
|
-
uses: sigstore/gh-action-sigstore-python@
|
113
|
+
uses: sigstore/gh-action-sigstore-python@f7ad0af51a5648d09a20d00370f0a91c3bdf8f84 # v3.0.1
|
114
114
|
with:
|
115
115
|
inputs: |
|
116
116
|
./dist/*.tar.gz
|
@@ -22,6 +22,184 @@ Blocks are the fundamental processing units in SDG Hub. Each block performs a sp
|
|
22
22
|
|
23
23
|
## LLM Blocks
|
24
24
|
|
25
|
+
### OpenAIChatBlock
|
26
|
+
- **Registered Name**: `OpenAIChatBlock`
|
27
|
+
- **Purpose**: Modern chat completion block using OpenAI Chat Completions API
|
28
|
+
- **Key Features**:
|
29
|
+
- Direct OpenAI message format support (system/user/assistant roles)
|
30
|
+
- All OpenAI Chat Completions API parameters supported
|
31
|
+
- Automatic retry logic for rate limits and API errors
|
32
|
+
- Comprehensive structured logging for monitoring
|
33
|
+
- Works with any OpenAI-compatible endpoint
|
34
|
+
|
35
|
+
**Parameters:**
|
36
|
+
- `block_name: str` - Name of the block
|
37
|
+
- `input_cols: Union[str, List[str]]` - Input column containing messages (must be exactly one)
|
38
|
+
- `output_cols: Union[str, List[str]]` - Output column for responses (must be exactly one)
|
39
|
+
- `client: openai.OpenAI` - OpenAI client instance
|
40
|
+
- `model_id: str` - Model ID to use (e.g., "gpt-4", "gpt-3.5-turbo")
|
41
|
+
- **OpenAI API Parameters** (all optional):
|
42
|
+
- `frequency_penalty: Optional[float]` - Penalize tokens in proportion to how often they have already appeared in the text so far (-2.0 to 2.0)
|
43
|
+
- `logit_bias: Optional[Dict[str, int]]` - Modify likelihood of specified tokens
|
44
|
+
- `logprobs: Optional[bool]` - Whether to return log probabilities
|
45
|
+
- `max_completion_tokens: Optional[int]` - Maximum tokens in completion
|
46
|
+
- `max_tokens: Optional[int]` - Maximum tokens in completion (legacy; prefer `max_completion_tokens` where the endpoint supports it)
|
47
|
+
- `n: Optional[int]` - Number of completions to generate
|
48
|
+
- `presence_penalty: Optional[float]` - Penalize tokens that have already appeared in the text so far, regardless of how often (-2.0 to 2.0)
|
49
|
+
- `response_format: Optional[Dict[str, Any]]` - Response format (e.g., JSON mode)
|
50
|
+
- `seed: Optional[int]` - Seed for deterministic outputs
|
51
|
+
- `stop: Optional[Union[str, List[str]]]` - Stop sequences
|
52
|
+
- `stream: Optional[bool]` - Whether to stream responses
|
53
|
+
- `temperature: Optional[float]` - Sampling temperature (0.0 to 2.0)
|
54
|
+
- `tool_choice: Optional[Union[str, Dict[str, Any]]]` - Tool selection strategy
|
55
|
+
- `tools: Optional[List[Dict[str, Any]]]` - Available tools for function calling
|
56
|
+
- `top_logprobs: Optional[int]` - Number of top log probabilities to return
|
57
|
+
- `top_p: Optional[float]` - Nucleus sampling parameter (0.0 to 1.0)
|
58
|
+
- `user: Optional[str]` - End-user identifier
|
59
|
+
- `extra_body: Optional[dict]` - Additional parameters for custom endpoints
|
60
|
+
|
61
|
+
**Example Usage:**
|
62
|
+
```yaml
|
63
|
+
- block_type: OpenAIChatBlock
|
64
|
+
block_config:
|
65
|
+
block_name: chat_generator
|
66
|
+
input_cols: messages
|
67
|
+
output_cols: response
|
68
|
+
model_id: gpt-4
|
69
|
+
temperature: 0.7
|
70
|
+
max_tokens: 500
|
71
|
+
```
|
72
|
+
|
73
|
+
**Example with Messages Dataset:**
|
74
|
+
```python
|
75
|
+
import openai
|
76
|
+
from datasets import Dataset
|
77
|
+
from sdg_hub.blocks import OpenAIChatBlock
|
78
|
+
|
79
|
+
# Create client
|
80
|
+
client = openai.OpenAI(api_key="your-api-key")
|
81
|
+
|
82
|
+
# Prepare dataset with messages in OpenAI format
|
83
|
+
messages_data = [
|
84
|
+
[
|
85
|
+
{"role": "system", "content": "You are a helpful assistant."},
|
86
|
+
{"role": "user", "content": "Explain quantum computing in simple terms."}
|
87
|
+
],
|
88
|
+
[
|
89
|
+
{"role": "user", "content": "What is the capital of France?"}
|
90
|
+
]
|
91
|
+
]
|
92
|
+
dataset = Dataset.from_dict({"messages": messages_data})
|
93
|
+
|
94
|
+
# Create and use block
|
95
|
+
block = OpenAIChatBlock(
|
96
|
+
block_name="qa_generator",
|
97
|
+
input_cols="messages",
|
98
|
+
output_cols="response",
|
99
|
+
client=client,
|
100
|
+
model_id="gpt-4",
|
101
|
+
temperature=0.7,
|
102
|
+
max_tokens=150
|
103
|
+
)
|
104
|
+
|
105
|
+
result = block.generate(dataset)
|
106
|
+
print(result["response"])
|
107
|
+
```
|
108
|
+
|
109
|
+
### OpenAIAsyncChatBlock
|
110
|
+
- **Registered Name**: `OpenAIAsyncChatBlock`
|
111
|
+
- **Purpose**: Async version of OpenAIChatBlock for concurrent processing and better performance
|
112
|
+
- **Key Features**:
|
113
|
+
- Concurrent async requests for improved throughput
|
114
|
+
- All features of OpenAIChatBlock
|
115
|
+
- Better performance for large batches
|
116
|
+
- Automatic concurrency management
|
117
|
+
|
118
|
+
**Parameters:**
|
119
|
+
- Same as `OpenAIChatBlock` except:
|
120
|
+
- `async_client: openai.AsyncOpenAI` - Async OpenAI client instance (passed in place of `client`)
|
121
|
+
|
122
|
+
**Example Usage:**
|
123
|
+
```yaml
|
124
|
+
- block_type: OpenAIAsyncChatBlock
|
125
|
+
block_config:
|
126
|
+
block_name: async_chat_generator
|
127
|
+
input_cols: messages
|
128
|
+
output_cols: response
|
129
|
+
model_id: gpt-4
|
130
|
+
temperature: 0.7
|
131
|
+
max_tokens: 500
|
132
|
+
```
|
133
|
+
|
134
|
+
**Example with Async Client:**
|
135
|
+
```python
|
136
|
+
import asyncio
|
137
|
+
import openai
|
138
|
+
from datasets import Dataset
|
139
|
+
from sdg_hub.blocks import OpenAIAsyncChatBlock
|
140
|
+
|
141
|
+
# Create async client
|
142
|
+
async_client = openai.AsyncOpenAI(api_key="your-api-key")
|
143
|
+
|
144
|
+
# Same dataset format as sync version
|
145
|
+
messages_data = [
|
146
|
+
[{"role": "user", "content": f"Generate a creative story about topic {i}"}]
|
147
|
+
for i in range(100) # Large batch for demonstration
|
148
|
+
]
|
149
|
+
dataset = Dataset.from_dict({"messages": messages_data})
|
150
|
+
|
151
|
+
# Create and use async block
|
152
|
+
block = OpenAIAsyncChatBlock(
|
153
|
+
block_name="async_story_generator",
|
154
|
+
input_cols="messages",
|
155
|
+
output_cols="story",
|
156
|
+
async_client=async_client,
|
157
|
+
model_id="gpt-4",
|
158
|
+
temperature=0.8,
|
159
|
+
max_tokens=200
|
160
|
+
)
|
161
|
+
|
162
|
+
# Process large batch concurrently
|
163
|
+
result = block.generate(dataset)
|
164
|
+
print(f"Generated {len(result)} stories concurrently")
|
165
|
+
```
|
166
|
+
|
167
|
+
**OpenAI-Compatible Endpoints:**
|
168
|
+
Both blocks work with any OpenAI-compatible endpoint:
|
169
|
+
|
170
|
+
```python
|
171
|
+
# Example with local endpoint
|
172
|
+
client = openai.OpenAI(
|
173
|
+
api_key="not-needed-for-local",
|
174
|
+
base_url="http://localhost:8000/v1"
|
175
|
+
)
|
176
|
+
|
177
|
+
# Example with other providers (Azure, Anthropic, etc.)
|
178
|
+
client = openai.OpenAI(
|
179
|
+
api_key="your-provider-key",
|
180
|
+
base_url="https://your-provider-endpoint.com/v1"
|
181
|
+
)
|
182
|
+
```
|
183
|
+
|
184
|
+
**Monitoring and Logging:**
|
185
|
+
Both blocks provide comprehensive structured logging:
|
186
|
+
- Initialization logs with model and parameters
|
187
|
+
- Generation start/completion logs with batch metrics
|
188
|
+
- Effective parameter tracking (including runtime overrides)
|
189
|
+
- Error tracking and retry information
|
190
|
+
|
191
|
+
Log output example:
|
192
|
+
```
|
193
|
+
INFO: Initialized OpenAIChatBlock 'chat_generator' with model 'gpt-4'
|
194
|
+
{"block_name": "chat_generator", "model_id": "gpt-4", "generation_params": {"temperature": 0.7}}
|
195
|
+
|
196
|
+
INFO: Starting generation for 10 samples
|
197
|
+
{"block_name": "chat_generator", "model_id": "gpt-4", "batch_size": 10, "effective_params": {"temperature": 0.9}}
|
198
|
+
|
199
|
+
INFO: Generation completed successfully for 10 samples
|
200
|
+
{"block_name": "chat_generator", "model_id": "gpt-4", "batch_size": 10}
|
201
|
+
```
|
202
|
+
|
25
203
|
### LLMBlock
|
26
204
|
- **Registered Name**: `LLMBlock`
|
27
205
|
- **Purpose**: Core block for text generation using language models
|
@@ -1,25 +1,25 @@
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
2
2
|
|
3
3
|
# Standard
|
4
|
-
import json
|
5
|
-
import random
|
6
|
-
import uuid
|
7
|
-
import os
|
8
|
-
import yaml
|
9
4
|
from pathlib import Path
|
10
5
|
from typing import List
|
6
|
+
import json
|
7
|
+
import os
|
8
|
+
import random
|
11
9
|
import re
|
10
|
+
import uuid
|
12
11
|
|
13
12
|
# Third Party
|
14
|
-
from datasets import Dataset
|
13
|
+
from datasets import Dataset, concatenate_datasets
|
14
|
+
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
|
15
15
|
from tabulate import tabulate
|
16
16
|
from transformers import AutoTokenizer
|
17
|
-
|
17
|
+
import yaml
|
18
18
|
|
19
|
-
#
|
20
|
-
import sdg_hub
|
19
|
+
# First Party
|
21
20
|
from sdg_hub.logger_config import setup_logger
|
22
21
|
from sdg_hub.utils.datautils import safe_concatenate_datasets
|
22
|
+
import sdg_hub
|
23
23
|
|
24
24
|
logger = setup_logger(__name__)
|
25
25
|
_DEFAULT_CHUNK_OVERLAP = 100
|
@@ -98,9 +98,70 @@ def _conv_pretrain(rec):
|
|
98
98
|
return rec
|
99
99
|
|
100
100
|
|
101
|
+
def mask_qa_per_doc(ds: Dataset, keep_no_qa_per_doc: int = 3) -> Dataset:
    """
    Flag each row as unmasked or masked based on how often its document was seen.

    Rows are visited in order. For every distinct value in the "document"
    column, the first ``keep_no_qa_per_doc`` rows are flagged ``unmask=True``
    (intended for pre-training); any later rows for that same document are
    flagged ``unmask=False``.

    Parameters
    ----------
    ds : Dataset
        Input dataset; must expose a "document" column.
    keep_no_qa_per_doc : int, default=3
        Number of rows per distinct document to flag as ``unmask=True``.

    Returns
    -------
    Dataset
        The rows of ``ds`` with an added boolean "unmask" column. Rows flagged
        True are placed first, followed by the rows flagged False, so the
        original row order is not preserved across the two groups.
    """
    seen_per_doc = {}
    kept_rows = []
    masked_rows = []

    for idx, document in enumerate(ds["document"]):
        # Running occurrence number (1-based) of this document so far.
        occurrence = seen_per_doc.get(document, 0) + 1
        seen_per_doc[document] = occurrence

        row = ds[idx].copy()
        row["unmask"] = occurrence <= keep_no_qa_per_doc
        bucket = kept_rows if row["unmask"] else masked_rows
        bucket.append(row)

    # Unmasked rows first, then masked rows.
    return concatenate_datasets(
        [Dataset.from_list(kept_rows), Dataset.from_list(masked_rows)]
    )
|
140
|
+
|
141
|
+
|
101
142
|
def generate_knowledge_qa_dataset(
|
102
|
-
generated_dataset: Dataset,
|
143
|
+
generated_dataset: Dataset,
|
144
|
+
keep_context_separate: bool = False,
|
145
|
+
keep_document_outline: bool = False,
|
146
|
+
keep_columns: List[str] = None,
|
147
|
+
filter_non_pre_training: bool = True,
|
148
|
+
keep_no_qa_per_doc: int = 3,
|
103
149
|
):
|
150
|
+
generated_dataset = generated_dataset.map(
|
151
|
+
lambda x: {
|
152
|
+
"response": x["response"]
|
153
|
+
.replace("[END]", "")
|
154
|
+
.replace("[ANSWER]", "")
|
155
|
+
.strip()
|
156
|
+
},
|
157
|
+
num_proc=10,
|
158
|
+
)
|
159
|
+
generated_dataset = mask_qa_per_doc(
|
160
|
+
generated_dataset, keep_no_qa_per_doc=keep_no_qa_per_doc
|
161
|
+
)
|
162
|
+
if filter_non_pre_training:
|
163
|
+
generated_dataset = generated_dataset.filter(lambda x: x["unmask"])
|
164
|
+
|
104
165
|
def __create_qa_row(rec):
|
105
166
|
context = rec["document"]
|
106
167
|
instruction = rec["question"]
|
@@ -146,7 +207,12 @@ def generate_knowledge_qa_dataset(
|
|
146
207
|
return {"messages": messages, "metadata": metadata, "id": str(uuid.uuid4())}
|
147
208
|
|
148
209
|
knowledge_ds = generated_dataset.map(
|
149
|
-
__create_qa_row,
|
210
|
+
__create_qa_row,
|
211
|
+
remove_columns=[
|
212
|
+
e
|
213
|
+
for e in generated_dataset.column_names
|
214
|
+
if e not in keep_columns + ["unmask"]
|
215
|
+
],
|
150
216
|
)
|
151
217
|
return knowledge_ds
|
152
218
|
|
@@ -6,6 +6,10 @@ This package provides various block implementations for data generation, process
|
|
6
6
|
# Local
|
7
7
|
from .block import Block
|
8
8
|
from .llmblock import LLMBlock, ConditionalLLMBlock
|
9
|
+
from .openaichatblock import (
|
10
|
+
OpenAIChatBlock,
|
11
|
+
OpenAIAsyncChatBlock
|
12
|
+
)
|
9
13
|
from .utilblocks import (
|
10
14
|
SamplePopulatorBlock,
|
11
15
|
SelectorBlock,
|
@@ -33,4 +37,6 @@ __all__ = [
|
|
33
37
|
"RenameColumns",
|
34
38
|
"SetToMajorityValue",
|
35
39
|
"BlockRegistry",
|
40
|
+
"OpenAIChatBlock",
|
41
|
+
"OpenAIAsyncChatBlock"
|
36
42
|
]
|