sdg-hub 0.4.2__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/integration-test.yml +48 -34
- sdg_hub-0.5.0/.github/workflows/packer.yml +33 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/test.yml +0 -13
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/CLAUDE.md +0 -7
- {sdg_hub-0.4.2/src/sdg_hub.egg-info → sdg_hub-0.5.0}/PKG-INFO +2 -2
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +1 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +2 -9
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/pyproject.toml +1 -1
- sdg_hub-0.5.0/scripts/packer/centos.pkr.hcl +52 -0
- sdg_hub-0.5.0/scripts/packer/setup-centos.sh +80 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/_version.py +3 -3
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/__init__.py +0 -22
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/rename_columns.py +19 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/base.py +8 -80
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +5 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.0/src/sdg_hub.egg-info}/PKG-INFO +2 -2
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/SOURCES.txt +3 -21
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/requires.txt +1 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_json_structure_block.py +1 -1
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_renameblock.py → sdg_hub-0.5.0/tests/blocks/transform/test_rename_columns.py +19 -19
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_uniform_col_val_setter.py +1 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +73 -3
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tox.ini +2 -2
- sdg_hub-0.4.2/.github/workflows/e2e.yml +0 -103
- sdg_hub-0.4.2/.github/workflows/packer.yml +0 -15
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
- sdg_hub-0.4.2/src/sdg_hub/core/flow/migration.py +0 -198
- sdg_hub-0.4.2/tests/blocks/deprecated/test_llmblock.py +0 -148
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_combinecolumns.py +0 -168
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -112
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_flattenblock.py +0 -217
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -37
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_selectorblock.py +0 -144
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_settomajority.py +0 -127
- sdg_hub-0.4.2/tests/flow/test_migration.py +0 -449
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/actionlint.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/actions/free-disk-space/action.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/dependabot.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/mergify.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/actionlint.dockerfile +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/actionlint.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/docs.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/lint.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/matchers/actionlint.json +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/matchers/pylint.json +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.github/workflows/pypi.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.gitignore +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.isort.cfg +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.markdownlint-cli2.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.pre-commit-config.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/.pylintrc +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/CONTRIBUTING.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/LICENSE +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/Makefile +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/.nojekyll +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/_coverpage.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/_navbar.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/_sidebar.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/api-reference.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/custom-blocks.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/filtering-blocks.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/llm-blocks.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/overview.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/blocks/transform-blocks.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/concepts.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/development.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/flows/discovery.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/flows/overview.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/index.html +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/installation.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/docs/quick-start.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/annotation_classification.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/news_classification_flow.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/news_classification_prompt.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/knowledge_tuning/knowledge_utils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/text_analysis/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/scripts/ruff.sh +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/setup.cfg +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/base.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/text_parser_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/registry.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/checkpointer.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/metadata.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/registry.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/validation.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/datautils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/error_handling.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/flow_metrics.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/logger_config.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/path_resolution.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/time_estimator.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub/py.typed +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/top_level.txt +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_llm_chat_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_llm_parser_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/llm/test_textparserblock.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/test_base_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/test_registry.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_config.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_index_based_mapper.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_melt_columns.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/blocks/transform/test_text_concat.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/conftest.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_base.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_checkpointer.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_dataset_requirements.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_integration.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_metadata.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_registry.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_time_estimation.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/flow/test_validation.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/utils/test_datautils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/utils/test_error_handling.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/utils/test_flow_metrics.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.0}/tests/utils/test_path_resolution.py +0 -0
@@ -7,29 +7,11 @@ on:
|
|
7
7
|
branches:
|
8
8
|
- "main"
|
9
9
|
- "release-**"
|
10
|
-
paths:
|
11
|
-
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
12
|
-
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
13
|
-
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
14
|
-
# Standard integration test triggers, DONT CHANGE THIS
|
15
|
-
- 'tests/integration/**/*.py'
|
16
|
-
- 'pyproject.toml'
|
17
|
-
- 'tox.ini'
|
18
|
-
- '.github/workflows/integration-test.yml'
|
19
10
|
pull_request:
|
20
11
|
branches:
|
21
12
|
- "main"
|
22
13
|
- "release-**"
|
23
14
|
types: [opened, synchronize, reopened, labeled]
|
24
|
-
paths:
|
25
|
-
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
26
|
-
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
27
|
-
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
28
|
-
# Standard integration test triggers, DONT CHANGE THIS
|
29
|
-
- 'tests/integration/**/*.py'
|
30
|
-
- 'pyproject.toml'
|
31
|
-
- 'tox.ini'
|
32
|
-
- '.github/workflows/integration-test.yml'
|
33
15
|
|
34
16
|
env:
|
35
17
|
LC_ALL: en_US.UTF-8
|
@@ -42,19 +24,58 @@ permissions:
|
|
42
24
|
contents: read
|
43
25
|
|
44
26
|
jobs:
|
27
|
+
check-trigger:
|
28
|
+
name: "Check If Integration Should Run"
|
29
|
+
runs-on: ubuntu-latest
|
30
|
+
outputs:
|
31
|
+
should_run: ${{ steps.check.outputs.should_run }}
|
32
|
+
steps:
|
33
|
+
- uses: actions/checkout@v4
|
34
|
+
|
35
|
+
- uses: dorny/paths-filter@v3
|
36
|
+
id: filter
|
37
|
+
if: github.event_name == 'pull_request'
|
38
|
+
with:
|
39
|
+
filters: |
|
40
|
+
relevant:
|
41
|
+
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
42
|
+
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
43
|
+
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
44
|
+
# Standard integration test triggers, DONT CHANGE THIS
|
45
|
+
- 'tests/integration/**/*.py'
|
46
|
+
- 'pyproject.toml'
|
47
|
+
- 'tox.ini'
|
48
|
+
- '.github/workflows/integration-test.yml'
|
49
|
+
|
50
|
+
- name: Determine if tests should run
|
51
|
+
id: check
|
52
|
+
run: |
|
53
|
+
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]] || [[ "${{ github.event_name }}" == "push" ]]; then
|
54
|
+
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
55
|
+
elif [[ "${{ github.event_name }}" == "pull_request" ]]; then
|
56
|
+
# Check if from fork
|
57
|
+
if [[ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]]; then
|
58
|
+
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
59
|
+
# Check if labeled event with correct label
|
60
|
+
elif [[ "${{ github.event.action }}" == "labeled" ]] && [[ "${{ contains(github.event.pull_request.labels.*.name, 'run-integration-tests') }}" == "true" ]]; then
|
61
|
+
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
62
|
+
# Check if relevant paths changed for non-labeled events
|
63
|
+
elif [[ "${{ github.event.action }}" != "labeled" ]] && [[ "${{ steps.filter.outputs.relevant }}" == "true" ]]; then
|
64
|
+
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
65
|
+
else
|
66
|
+
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
67
|
+
fi
|
68
|
+
else
|
69
|
+
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
70
|
+
fi
|
71
|
+
|
45
72
|
integration-test:
|
46
73
|
name: "Integration Tests - ${{ matrix.python }} on ${{ matrix.platform }}"
|
47
74
|
runs-on: "${{ matrix.platform }}"
|
75
|
+
needs: check-trigger
|
76
|
+
if: needs.check-trigger.outputs.should_run == 'true'
|
48
77
|
# Require manual approval before running (via GitHub Environment)
|
49
78
|
environment: integration-tests
|
50
|
-
# Skip fork PRs (they can't access environment secrets anyway)
|
51
|
-
# Also check for 'run-integration-tests' label on labeled events
|
52
|
-
if: |
|
53
|
-
github.event_name == 'workflow_dispatch' ||
|
54
|
-
github.event_name == 'push' ||
|
55
|
-
(github.event_name == 'pull_request' &&
|
56
|
-
github.event.pull_request.head.repo.full_name == github.repository &&
|
57
|
-
(github.event.action != 'labeled' || contains(github.event.pull_request.labels.*.name, 'run-integration-tests')))
|
58
79
|
strategy:
|
59
80
|
matrix:
|
60
81
|
python:
|
@@ -89,12 +110,9 @@ jobs:
|
|
89
110
|
**/pyproject.toml
|
90
111
|
**/requirements*.txt
|
91
112
|
|
92
|
-
- name: Remove llama-cpp-python from cache
|
93
|
-
run: |
|
94
|
-
pip cache remove llama_cpp_python
|
95
113
|
|
96
114
|
- name: Cache huggingface datasets
|
97
|
-
uses: actions/cache@
|
115
|
+
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
|
98
116
|
with:
|
99
117
|
path: ~/.cache/huggingface
|
100
118
|
# Invalidate cache when any example notebook changes (may affect dataset downloads)
|
@@ -111,10 +129,6 @@ jobs:
|
|
111
129
|
run: |
|
112
130
|
tox -e py3-integrationcov
|
113
131
|
|
114
|
-
- name: Remove llama-cpp-python from cache
|
115
|
-
if: always()
|
116
|
-
run: |
|
117
|
-
pip cache remove llama_cpp_python
|
118
132
|
|
119
133
|
- name: Upload integration test coverage to Codecov
|
120
134
|
uses: codecov/codecov-action@v4
|
@@ -0,0 +1,33 @@
|
|
1
|
+
name: Build AMI with Packer
|
2
|
+
|
3
|
+
on:
|
4
|
+
workflow_dispatch:
|
5
|
+
|
6
|
+
jobs:
|
7
|
+
build-ami:
|
8
|
+
runs-on: ubuntu-latest
|
9
|
+
permissions:
|
10
|
+
id-token: write # This is required for OIDC
|
11
|
+
contents: read
|
12
|
+
|
13
|
+
steps:
|
14
|
+
- name: Checkout repository
|
15
|
+
uses: actions/checkout@v4
|
16
|
+
|
17
|
+
- name: Configure AWS Credentials
|
18
|
+
uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c
|
19
|
+
with:
|
20
|
+
role-to-assume: arn:aws:iam::851725220677:role/github-actions-packer-role
|
21
|
+
aws-region: us-east-2
|
22
|
+
role-session-name: github-actions-packer # For tracking in CloudTrail
|
23
|
+
|
24
|
+
- name: Setup Packer
|
25
|
+
uses: hashicorp/setup-packer@1aa358be5cf73883762b302a3a03abd66e75b232
|
26
|
+
|
27
|
+
- name: Build and create AMI
|
28
|
+
run: |
|
29
|
+
set -euo pipefail
|
30
|
+
cd scripts/packer
|
31
|
+
packer init .
|
32
|
+
packer validate .
|
33
|
+
packer build .
|
@@ -86,16 +86,7 @@ jobs:
|
|
86
86
|
**/pyproject.toml
|
87
87
|
**/requirements*.txt
|
88
88
|
|
89
|
-
- name: Remove llama-cpp-python from cache
|
90
|
-
run: |
|
91
|
-
pip cache remove llama_cpp_python
|
92
89
|
|
93
|
-
- name: Cache huggingface
|
94
|
-
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
|
95
|
-
with:
|
96
|
-
path: ~/.cache/huggingface
|
97
|
-
# config contains DEFAULT_MODEL
|
98
|
-
key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }}
|
99
90
|
|
100
91
|
- name: Install dependencies
|
101
92
|
run: |
|
@@ -107,10 +98,6 @@ jobs:
|
|
107
98
|
tox -e py3-unitcov
|
108
99
|
|
109
100
|
|
110
|
-
- name: Remove llama-cpp-python from cache
|
111
|
-
if: always()
|
112
|
-
run: |
|
113
|
-
pip cache remove llama_cpp_python
|
114
101
|
|
115
102
|
- name: Upload coverage to Codecov
|
116
103
|
uses: codecov/codecov-action@v4
|
@@ -86,7 +86,6 @@ The framework is built around a modular block system with **composability at its
|
|
86
86
|
- `transform/`: Data transformation blocks (column operations, text manipulation)
|
87
87
|
- `filtering/`: Data filtering blocks with quality thresholds
|
88
88
|
- `evaluation/`: Quality evaluation blocks (faithfulness, relevancy assessment)
|
89
|
-
- `deprecated_blocks/`: Legacy blocks maintained for backward compatibility
|
90
89
|
|
91
90
|
**Key Benefits**: Type-safe composition, automatic validation, rich logging, and high-performance async processing.
|
92
91
|
|
@@ -97,7 +96,6 @@ Flows orchestrate multiple blocks into data processing pipelines:
|
|
97
96
|
- **FlowRegistry** (`src/sdg_hub/core/flow/registry.py`): Registry for flow discovery
|
98
97
|
- **FlowMetadata** (`src/sdg_hub/core/flow/metadata.py`): Metadata and parameter definitions
|
99
98
|
- **FlowValidator** (`src/sdg_hub/core/flow/validation.py`): YAML structure validation
|
100
|
-
- **FlowMigration** (`src/sdg_hub/core/flow/migration.py`): Backward compatibility for old flow formats
|
101
99
|
|
102
100
|
### Flow Configuration
|
103
101
|
Flows are defined in YAML files with this structure:
|
@@ -148,11 +146,6 @@ All blocks operate on HuggingFace `datasets.Dataset` objects:
|
|
148
146
|
- Rich logging provides processing summaries
|
149
147
|
- Empty dataset handling with appropriate errors
|
150
148
|
|
151
|
-
### Backward Compatibility
|
152
|
-
The framework maintains compatibility with legacy formats:
|
153
|
-
- Deprecated blocks are preserved in `deprecated_blocks/`
|
154
|
-
- Flow migration automatically converts old YAML formats
|
155
|
-
- Legacy LLMBlocks receive special handling during execution
|
156
149
|
|
157
150
|
## Testing Guidelines
|
158
151
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sdg_hub
|
3
|
-
Version: 0.4.2
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Synthetic Data Generation
|
5
5
|
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
6
|
License: Apache-2.0
|
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
|
|
23
23
|
Description-Content-Type: text/markdown
|
24
24
|
License-File: LICENSE
|
25
25
|
Requires-Dist: click<9.0.0,>=8.1.7
|
26
|
-
Requires-Dist: datasets
|
26
|
+
Requires-Dist: datasets>=4.0.0
|
27
27
|
Requires-Dist: httpx<1.0.0,>=0.25.0
|
28
28
|
Requires-Dist: jinja2
|
29
29
|
Requires-Dist: litellm<1.75.0,>=1.73.0
|
@@ -359,7 +359,7 @@
|
|
359
359
|
"processed_knowledge_dataset = processed_knowledge_dataset.remove_columns(['messages']).rename_column('messages_without_think', 'messages')\n",
|
360
360
|
"\n",
|
361
361
|
"cfg = RAFTConfig(k_passages=5, max_tokens_per_chunk=400, p_include_oracle=0.9)\n",
|
362
|
-
"raft_samples = build_raft_samples(
|
362
|
+
"raft_samples = build_raft_samples(processed_knowledge_dataset, cfg)\n",
|
363
363
|
"raft_samples = raft_samples.map(build_messages).remove_columns(['question', 'context', 'oracle_context', 'cot_answer', 'answer', 'instruction', 'type', 'meta'])\n",
|
364
364
|
"\n",
|
365
365
|
"fp = \"<Instruction/Skills dataset>\" # TODO: Replace with huggingface dataset path once its uploaded\n",
|
@@ -237,20 +237,13 @@ def build_messages(raft_record: Dict[str, Any]):
|
|
237
237
|
Output:
|
238
238
|
messages: list of {"role": "system"|"user"|"assistant", "content": str}
|
239
239
|
"""
|
240
|
-
# 1.
|
241
|
-
sys_msg = raft_record.get("instruction") or (
|
242
|
-
"You are a domain expert. You must answer questions by first quoting a span "
|
243
|
-
"verbatim from the relevant passage, then giving reasoning, then the final answer. "
|
244
|
-
"Ignore distractor passages."
|
245
|
-
)
|
246
|
-
|
247
|
-
# 2. User message: serialize passages + question
|
240
|
+
# 1. User message: serialize passages + question
|
248
241
|
passages = "\n\n".join(
|
249
242
|
[f"[Passage {i+1}] {p}" for i, p in enumerate(raft_record["context"])]
|
250
243
|
)
|
251
244
|
user_msg = f"Passages:\n{passages}\n\nQuestion: {raft_record['question']}"
|
252
245
|
|
253
|
-
# 3. Assistant message: the gold output
|
246
|
+
# 2. Assistant message: the gold output
|
254
247
|
assistant_msg = raft_record["answer"]
|
255
248
|
|
256
249
|
return {"messages" : [
|
@@ -0,0 +1,52 @@
|
|
1
|
+
packer {
|
2
|
+
required_plugins {
|
3
|
+
amazon = {
|
4
|
+
version = ">= 1.2.8"
|
5
|
+
source = "github.com/hashicorp/amazon"
|
6
|
+
}
|
7
|
+
}
|
8
|
+
}
|
9
|
+
|
10
|
+
variable "github_sha" {
|
11
|
+
type = string
|
12
|
+
description = "GitHub commit SHA to tag the AMI with"
|
13
|
+
default = env("GITHUB_SHA")
|
14
|
+
}
|
15
|
+
|
16
|
+
variable "github_repository" {
|
17
|
+
type = string
|
18
|
+
description = "GitHub repository name to tag the AMI with"
|
19
|
+
default = env("GITHUB_REPOSITORY")
|
20
|
+
}
|
21
|
+
|
22
|
+
source "amazon-ebs" "centos" {
|
23
|
+
ami_name = "github-actions-centos-nvidia-ami-{{timestamp}}"
|
24
|
+
# Use the lowest-cost instance type that can efficiently build and sanity-check the driver.
|
25
|
+
# It should be old enough to be low-cost, but new enough to be compatible with our desired driver version.
|
26
|
+
instance_type = "g6.xlarge"
|
27
|
+
region = "us-east-2"
|
28
|
+
source_ami_filter {
|
29
|
+
filters = {
|
30
|
+
name = "CentOS Stream 9 x86_64*"
|
31
|
+
root-device-type = "ebs"
|
32
|
+
virtualization-type = "hvm"
|
33
|
+
}
|
34
|
+
most_recent = true
|
35
|
+
owners = ["125523088429"] # CentOS CPE team ID.
|
36
|
+
}
|
37
|
+
ssh_username = "ec2-user"
|
38
|
+
tags = {
|
39
|
+
Name = "CentOS Stream 9 with Nvidia Drivers"
|
40
|
+
BuiltBy = "Packer"
|
41
|
+
GitHubCommitSHA = var.github_sha
|
42
|
+
GitHubRepository = var.github_repository
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
build {
|
47
|
+
sources = ["source.amazon-ebs.centos"]
|
48
|
+
provisioner "shell" {
|
49
|
+
script = "./setup-centos.sh"
|
50
|
+
execute_command = "sudo bash {{.Path}}"
|
51
|
+
}
|
52
|
+
}
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
# Setup script for CentOS GitHub Actions AMI
|
3
|
+
# Derived from:
|
4
|
+
# github.com/containers/ai-lab-recipes/blob/main/training/nvidia-bootc/Containerfile
|
5
|
+
|
6
|
+
set -euxo pipefail
|
7
|
+
|
8
|
+
DRIVER_VERSION="580.65.06"
|
9
|
+
# CUDA_VERSION is embedded in the driver "local repo" package
|
10
|
+
|
11
|
+
if [[ $(id -u) != "0" ]]; then
|
12
|
+
echo "you must run this script as root."
|
13
|
+
exit 1
|
14
|
+
fi
|
15
|
+
|
16
|
+
function configure_dnf {
|
17
|
+
# Configure the DNF repos and options we need for CI.
|
18
|
+
dnf -y install dnf-plugins-core
|
19
|
+
dnf config-manager --save \
|
20
|
+
--setopt=skip_missing_names_on_install=False \
|
21
|
+
--setopt=install_weak_deps=False
|
22
|
+
|
23
|
+
dnf -y install epel-release
|
24
|
+
dnf -y install https://us.download.nvidia.com/tesla/$DRIVER_VERSION/nvidia-driver-local-repo-rhel9-$DRIVER_VERSION-1.0-1.x86_64.rpm
|
25
|
+
# TODO: We might be able to use a nvidia.com yum repo instead of the local repo?
|
26
|
+
# dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel${OS_VERSION_MAJOR}/${CUDA_REPO_ARCH}/cuda-rhel${OS_VERSION_MAJOR}.repo
|
27
|
+
}
|
28
|
+
|
29
|
+
function install_userland_packages {
|
30
|
+
# CI tests in GH Actions will require these packages:
|
31
|
+
dnf -y install nvtop podman skopeo git python3.12 python3.12-devel
|
32
|
+
}
|
33
|
+
|
34
|
+
function install_kernel_driver {
|
35
|
+
# Install nvidia kernel driver.
|
36
|
+
# DKMS will compile the nvidia.ko driver for all kernels for which we have installed a kernel-devel package.
|
37
|
+
# By default, the "dnf module install" command will install the latest kernel-devel package that CentOS has published.
|
38
|
+
dnf -y install "kernel-devel-$(uname -r)" gcc make dkms elfutils-libelf-devel # also build for the currently-running kernel.
|
39
|
+
# If we had configured a previous nvidia-driver version with DNF, reset it:
|
40
|
+
dnf -y module reset nvidia-driver || true
|
41
|
+
DRIVER_STREAM=$(echo $DRIVER_VERSION | cut -d. -f1)
|
42
|
+
dnf -y module install nvidia-driver:${DRIVER_STREAM}-dkms # or use :latest-dkms after confirming available streams
|
43
|
+
}
|
44
|
+
|
45
|
+
function test_kernel_driver {
|
46
|
+
# The nvidia driver DNF module (above) installs a dkms RPM.
|
47
|
+
# That dkms RPM compiles and installs the nvidia.ko module.
|
48
|
+
# List all the modules that dkms has compiled:
|
49
|
+
dkms status || true
|
50
|
+
# Load the module (ok if it’s already loaded or unavailable for this kernel):
|
51
|
+
modprobe -q nvidia || true
|
52
|
+
# If a GPU is present, verify userspace; otherwise, fail the job:
|
53
|
+
nvidia-smi
|
54
|
+
}
|
55
|
+
|
56
|
+
function install_container_toolkit {
|
57
|
+
# Install nvidia container toolkit.
|
58
|
+
# When we pass GPU devices to a container (podman run --device nvidia.com/gpu=all), we use the nvidia CTK to do that.
|
59
|
+
# See docs at https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
|
60
|
+
curl -sSfL -o /etc/yum.repos.d/nvidia-container-toolkit.repo \
|
61
|
+
https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
|
62
|
+
dnf config-manager --enable nvidia-container-toolkit-experimental
|
63
|
+
export NVIDIA_CONTAINER_TOOLKIT_VERSION=1.17.8-1
|
64
|
+
dnf install -y \
|
65
|
+
nvidia-container-toolkit-${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
|
66
|
+
nvidia-container-toolkit-base-${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
|
67
|
+
libnvidia-container-tools-${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
|
68
|
+
libnvidia-container1-${NVIDIA_CONTAINER_TOOLKIT_VERSION}
|
69
|
+
# Verify it works:
|
70
|
+
nvidia-ctk --version
|
71
|
+
# When you boot a node, you must run:
|
72
|
+
# sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
|
73
|
+
# This command scans your system for NVIDIA GPUs and creates a YAML file that lists the available devices.
|
74
|
+
}
|
75
|
+
|
76
|
+
configure_dnf
|
77
|
+
install_userland_packages
|
78
|
+
install_kernel_driver
|
79
|
+
test_kernel_driver
|
80
|
+
install_container_toolkit
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
28
28
|
commit_id: COMMIT_ID
|
29
29
|
__commit_id__: COMMIT_ID
|
30
30
|
|
31
|
-
__version__ = version = '0.4.2'
|
32
|
-
__version_tuple__ = version_tuple = (0, 4, 2)
|
31
|
+
__version__ = version = '0.5.0'
|
32
|
+
__version_tuple__ = version_tuple = (0, 5, 0)
|
33
33
|
|
34
|
-
__commit_id__ = commit_id = '
|
34
|
+
__commit_id__ = commit_id = 'ge1e260984'
|
@@ -5,17 +5,6 @@ This package provides various block implementations for data generation, process
|
|
5
5
|
|
6
6
|
# Local
|
7
7
|
from .base import BaseBlock
|
8
|
-
from .deprecated_blocks import (
|
9
|
-
CombineColumnsBlock,
|
10
|
-
DuplicateColumns,
|
11
|
-
FilterByValueBlock,
|
12
|
-
FlattenColumnsBlock,
|
13
|
-
LLMBlock,
|
14
|
-
RenameColumns,
|
15
|
-
SamplePopulatorBlock,
|
16
|
-
SelectorBlock,
|
17
|
-
SetToMajorityValue,
|
18
|
-
)
|
19
8
|
from .filtering import ColumnValueFilterBlock
|
20
9
|
from .llm import LLMChatBlock, LLMParserBlock, PromptBuilderBlock, TextParserBlock
|
21
10
|
from .registry import BlockRegistry
|
@@ -28,8 +17,6 @@ from .transform import (
|
|
28
17
|
UniformColumnValueSetter,
|
29
18
|
)
|
30
19
|
|
31
|
-
# All blocks moved to deprecated_blocks or transform modules
|
32
|
-
|
33
20
|
__all__ = [
|
34
21
|
"BaseBlock",
|
35
22
|
"BlockRegistry",
|
@@ -40,15 +27,6 @@ __all__ = [
|
|
40
27
|
"RenameColumnsBlock",
|
41
28
|
"TextConcatBlock",
|
42
29
|
"UniformColumnValueSetter",
|
43
|
-
"CombineColumnsBlock", # Deprecated
|
44
|
-
"DuplicateColumns", # Deprecated
|
45
|
-
"FilterByValueBlock", # Deprecated
|
46
|
-
"FlattenColumnsBlock", # Deprecated
|
47
|
-
"RenameColumns", # Deprecated
|
48
|
-
"SamplePopulatorBlock", # Deprecated
|
49
|
-
"SelectorBlock", # Deprecated
|
50
|
-
"SetToMajorityValue", # Deprecated
|
51
|
-
"LLMBlock", # Deprecated
|
52
30
|
"LLMChatBlock",
|
53
31
|
"LLMParserBlock",
|
54
32
|
"TextParserBlock",
|
@@ -64,6 +64,25 @@ class RenameColumnsBlock(BaseBlock):
|
|
64
64
|
-------
|
65
65
|
Dataset
|
66
66
|
Dataset with renamed columns.
|
67
|
+
|
68
|
+
Raises
|
69
|
+
------
|
70
|
+
ValueError
|
71
|
+
If attempting to rename to a column name that already exists.
|
67
72
|
"""
|
73
|
+
# Check for column name collisions
|
74
|
+
# Strict validation: no target column name can be an existing column name
|
75
|
+
# This prevents chained/circular renames which can be confusing
|
76
|
+
existing_cols = set(samples.column_names)
|
77
|
+
target_cols = set(self.input_cols.values())
|
78
|
+
|
79
|
+
collision = target_cols & existing_cols
|
80
|
+
if collision:
|
81
|
+
raise ValueError(
|
82
|
+
f"Cannot rename to existing column names: {sorted(collision)}. "
|
83
|
+
"Target column names must not already exist in the dataset. "
|
84
|
+
"Chained renames are not supported."
|
85
|
+
)
|
86
|
+
|
68
87
|
# Rename columns using HuggingFace datasets method
|
69
88
|
return samples.rename_columns(self.input_cols)
|
@@ -41,7 +41,6 @@ from ..utils.time_estimator import estimate_execution_time
|
|
41
41
|
from ..utils.yaml_utils import save_flow_yaml
|
42
42
|
from .checkpointer import FlowCheckpointer
|
43
43
|
from .metadata import DatasetRequirements, FlowMetadata
|
44
|
-
from .migration import FlowMigration
|
45
44
|
from .validation import FlowValidator
|
46
45
|
|
47
46
|
logger = setup_logger(__name__)
|
@@ -73,8 +72,6 @@ class Flow(BaseModel):
|
|
73
72
|
model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
|
74
73
|
|
75
74
|
# Private attributes (not serialized)
|
76
|
-
_migrated_runtime_params: dict[str, dict[str, Any]] = {}
|
77
|
-
_llm_client: Any = None # Only used for backward compatibility with old YAMLs
|
78
75
|
_model_config_set: bool = False # Track if model configuration has been set
|
79
76
|
_block_metrics: list[dict[str, Any]] = PrivateAttr(
|
80
77
|
default_factory=list
|
@@ -113,16 +110,13 @@ class Flow(BaseModel):
|
|
113
110
|
return self
|
114
111
|
|
115
112
|
@classmethod
|
116
|
-
def from_yaml(cls, yaml_path: str
|
113
|
+
def from_yaml(cls, yaml_path: str) -> "Flow":
|
117
114
|
"""Load flow from YAML configuration file.
|
118
115
|
|
119
116
|
Parameters
|
120
117
|
----------
|
121
118
|
yaml_path : str
|
122
119
|
Path to the YAML flow configuration file.
|
123
|
-
client : Any, optional
|
124
|
-
LLM client instance. Required for backward compatibility with old format YAMLs
|
125
|
-
that use deprecated LLMBlocks. Ignored for new format YAMLs.
|
126
120
|
|
127
121
|
Returns
|
128
122
|
-------
|
@@ -153,21 +147,6 @@ class Flow(BaseModel):
|
|
153
147
|
except yaml.YAMLError as exc:
|
154
148
|
raise FlowValidationError(f"Invalid YAML in {yaml_path}: {exc}") from exc
|
155
149
|
|
156
|
-
# Check if this is an old format flow and migrate if necessary
|
157
|
-
migrated_runtime_params = None
|
158
|
-
is_old_format = FlowMigration.is_old_format(flow_config)
|
159
|
-
if is_old_format:
|
160
|
-
logger.info(f"Detected old format flow, migrating: {yaml_path}")
|
161
|
-
if client is None:
|
162
|
-
logger.warning(
|
163
|
-
"Old format YAML detected but no client provided. LLMBlocks may fail."
|
164
|
-
)
|
165
|
-
flow_config, migrated_runtime_params = FlowMigration.migrate_to_new_format(
|
166
|
-
flow_config, yaml_path
|
167
|
-
)
|
168
|
-
# Save migrated config back to YAML to persist id
|
169
|
-
save_flow_yaml(yaml_path, flow_config, "migrated to new format")
|
170
|
-
|
171
150
|
# Validate YAML structure
|
172
151
|
validator = FlowValidator()
|
173
152
|
validation_errors = validator.validate_yaml_structure(flow_config)
|
@@ -194,19 +173,6 @@ class Flow(BaseModel):
|
|
194
173
|
|
195
174
|
for i, block_config in enumerate(block_configs):
|
196
175
|
try:
|
197
|
-
# Inject client for deprecated LLMBlocks if this is an old format flow
|
198
|
-
if (
|
199
|
-
is_old_format
|
200
|
-
and block_config.get("block_type") == "LLMBlock"
|
201
|
-
and client is not None
|
202
|
-
):
|
203
|
-
if "block_config" not in block_config:
|
204
|
-
block_config["block_config"] = {}
|
205
|
-
block_config["block_config"]["client"] = client
|
206
|
-
logger.debug(
|
207
|
-
f"Injected client for deprecated LLMBlock: {block_config['block_config'].get('block_name')}"
|
208
|
-
)
|
209
|
-
|
210
176
|
block = cls._create_block_from_config(block_config, yaml_dir)
|
211
177
|
blocks.append(block)
|
212
178
|
except Exception as exc:
|
@@ -228,12 +194,6 @@ class Flow(BaseModel):
|
|
228
194
|
)
|
229
195
|
else:
|
230
196
|
logger.debug(f"Flow already had id: {flow.metadata.id}")
|
231
|
-
# Store migrated runtime params and client for backward compatibility
|
232
|
-
if migrated_runtime_params:
|
233
|
-
flow._migrated_runtime_params = migrated_runtime_params
|
234
|
-
if is_old_format and client is not None:
|
235
|
-
flow._llm_client = client
|
236
|
-
|
237
197
|
# Check if this is a flow without LLM blocks
|
238
198
|
llm_blocks = flow._detect_llm_blocks()
|
239
199
|
if not llm_blocks:
|
@@ -484,12 +444,6 @@ class Flow(BaseModel):
|
|
484
444
|
self._block_metrics = []
|
485
445
|
run_start = time.perf_counter()
|
486
446
|
|
487
|
-
# Merge migrated runtime params with provided ones (provided ones take precedence)
|
488
|
-
merged_runtime_params = self._migrated_runtime_params.copy()
|
489
|
-
if runtime_params:
|
490
|
-
merged_runtime_params.update(runtime_params)
|
491
|
-
runtime_params = merged_runtime_params
|
492
|
-
|
493
447
|
# Execute flow with metrics capture, ensuring metrics are always displayed/saved
|
494
448
|
final_dataset = None
|
495
449
|
execution_successful = False
|
@@ -647,22 +601,8 @@ class Flow(BaseModel):
|
|
647
601
|
input_cols = set(current_dataset.column_names)
|
648
602
|
|
649
603
|
try:
|
650
|
-
#
|
651
|
-
|
652
|
-
hasattr(block, "__class__")
|
653
|
-
and hasattr(block.__class__, "__module__")
|
654
|
-
and "deprecated_blocks" in block.__class__.__module__
|
655
|
-
)
|
656
|
-
|
657
|
-
if is_deprecated_block:
|
658
|
-
exec_logger.debug(
|
659
|
-
f"Skipping validations for deprecated block: {block.block_name}"
|
660
|
-
)
|
661
|
-
# Call generate() directly to skip validations, but keep the runtime params
|
662
|
-
current_dataset = block.generate(current_dataset, **block_kwargs)
|
663
|
-
else:
|
664
|
-
# Execute block with validation and logging
|
665
|
-
current_dataset = block(current_dataset, **block_kwargs)
|
604
|
+
# Execute block with validation and logging
|
605
|
+
current_dataset = block(current_dataset, **block_kwargs)
|
666
606
|
|
667
607
|
# Validate output
|
668
608
|
if len(current_dataset) == 0:
|
@@ -724,9 +664,11 @@ class Flow(BaseModel):
|
|
724
664
|
return current_dataset
|
725
665
|
|
726
666
|
def _prepare_block_kwargs(
|
727
|
-
self, block: BaseBlock, runtime_params: dict[str, dict[str, Any]]
|
667
|
+
self, block: BaseBlock, runtime_params: Optional[dict[str, dict[str, Any]]]
|
728
668
|
) -> dict[str, Any]:
|
729
669
|
"""Prepare execution parameters for a block."""
|
670
|
+
if runtime_params is None:
|
671
|
+
return {}
|
730
672
|
return runtime_params.get(block.block_name, {})
|
731
673
|
|
732
674
|
def set_model_config(
|
@@ -1114,22 +1056,8 @@ class Flow(BaseModel):
|
|
1114
1056
|
if max_concurrency is not None:
|
1115
1057
|
block_kwargs["_flow_max_concurrency"] = max_concurrency
|
1116
1058
|
|
1117
|
-
#
|
1118
|
-
|
1119
|
-
hasattr(block, "__class__")
|
1120
|
-
and hasattr(block.__class__, "__module__")
|
1121
|
-
and "deprecated_blocks" in block.__class__.__module__
|
1122
|
-
)
|
1123
|
-
|
1124
|
-
if is_deprecated_block:
|
1125
|
-
logger.debug(
|
1126
|
-
f"Dry run: Skipping validations for deprecated block: {block.block_name}"
|
1127
|
-
)
|
1128
|
-
# Call generate() directly to skip validations, but keep the runtime params
|
1129
|
-
current_dataset = block.generate(current_dataset, **block_kwargs)
|
1130
|
-
else:
|
1131
|
-
# Execute block with validation and logging
|
1132
|
-
current_dataset = block(current_dataset, **block_kwargs)
|
1059
|
+
# Execute block with validation and logging
|
1060
|
+
current_dataset = block(current_dataset, **block_kwargs)
|
1133
1061
|
|
1134
1062
|
block_execution_time = (
|
1135
1063
|
time.perf_counter() - block_start_time
|
@@ -77,9 +77,13 @@ blocks:
|
|
77
77
|
- ''
|
78
78
|
- block_type: RenameColumnsBlock
|
79
79
|
block_config:
|
80
|
-
block_name:
|
80
|
+
block_name: rename_to_raw_document_column
|
81
81
|
input_cols:
|
82
82
|
document: raw_document
|
83
|
+
- block_type: RenameColumnsBlock
|
84
|
+
block_config:
|
85
|
+
block_name: rename_to_document_column
|
86
|
+
input_cols:
|
83
87
|
summary: document
|
84
88
|
- block_type: PromptBuilderBlock
|
85
89
|
block_config:
|