sdg-hub 0.4.2__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/integration-test.yml +48 -34
- sdg_hub-0.5.1/.github/workflows/packer.yml +33 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/test.yml +0 -13
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/CLAUDE.md +0 -7
- {sdg_hub-0.4.2/src/sdg_hub.egg-info → sdg_hub-0.5.1}/PKG-INFO +2 -2
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/transform-blocks.md +2 -2
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/flows/overview.md +348 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +1 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +2 -9
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/pyproject.toml +1 -1
- sdg_hub-0.5.1/scripts/packer/centos.pkr.hcl +52 -0
- sdg_hub-0.5.1/scripts/packer/setup-centos.sh +80 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/_version.py +3 -3
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/__init__.py +0 -22
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +57 -5
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/text_parser_block.py +57 -5
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/rename_columns.py +19 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/base.py +57 -80
- sdg_hub-0.5.1/src/sdg_hub/core/utils/temp_manager.py +57 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +5 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.1/src/sdg_hub.egg-info}/PKG-INFO +2 -2
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub.egg-info/SOURCES.txt +4 -21
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub.egg-info/requires.txt +1 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_json_structure_block.py +1 -1
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_renameblock.py → sdg_hub-0.5.1/tests/blocks/transform/test_rename_columns.py +19 -19
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_uniform_col_val_setter.py +1 -1
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +73 -3
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tox.ini +2 -2
- sdg_hub-0.4.2/.github/workflows/e2e.yml +0 -103
- sdg_hub-0.4.2/.github/workflows/packer.yml +0 -15
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
- sdg_hub-0.4.2/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
- sdg_hub-0.4.2/src/sdg_hub/core/flow/migration.py +0 -198
- sdg_hub-0.4.2/tests/blocks/deprecated/test_llmblock.py +0 -148
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_combinecolumns.py +0 -168
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -112
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_flattenblock.py +0 -217
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -37
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_selectorblock.py +0 -144
- sdg_hub-0.4.2/tests/blocks/utilblocks/test_settomajority.py +0 -127
- sdg_hub-0.4.2/tests/flow/test_migration.py +0 -449
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/actionlint.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/actions/free-disk-space/action.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/dependabot.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/mergify.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/actionlint.dockerfile +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/actionlint.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/docs.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/lint.yml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/matchers/actionlint.json +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/matchers/pylint.json +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.github/workflows/pypi.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.gitignore +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.isort.cfg +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.markdownlint-cli2.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.pre-commit-config.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/.pylintrc +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/CONTRIBUTING.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/LICENSE +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/Makefile +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/.nojekyll +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/_coverpage.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/_navbar.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/_sidebar.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/api-reference.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/custom-blocks.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/filtering-blocks.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/llm-blocks.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/blocks/overview.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/concepts.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/development.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/flows/discovery.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/index.html +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/installation.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/docs/quick-start.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/annotation_classification.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/news_classification_flow.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/news_classification_prompt.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/knowledge_tuning/knowledge_utils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/text_analysis/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/scripts/ruff.sh +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/setup.cfg +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/base.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/registry.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/checkpointer.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/metadata.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/registry.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/flow/validation.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/datautils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/error_handling.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/flow_metrics.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/logger_config.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/path_resolution.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/time_estimator.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub/py.typed +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/src/sdg_hub.egg-info/top_level.txt +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_llm_chat_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_llm_parser_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/llm/test_textparserblock.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/test_base_block.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/test_registry.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_config.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_index_based_mapper.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_melt_columns.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/blocks/transform/test_text_concat.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/conftest.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_base.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_checkpointer.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_dataset_requirements.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_integration.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_metadata.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_registry.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_time_estimation.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/flow/test_validation.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/utils/test_datautils.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/utils/test_error_handling.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/utils/test_flow_metrics.py +0 -0
- {sdg_hub-0.4.2 → sdg_hub-0.5.1}/tests/utils/test_path_resolution.py +0 -0
|
@@ -7,29 +7,11 @@ on:
|
|
|
7
7
|
branches:
|
|
8
8
|
- "main"
|
|
9
9
|
- "release-**"
|
|
10
|
-
paths:
|
|
11
|
-
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
|
12
|
-
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
|
13
|
-
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
|
14
|
-
# Standard integration test triggers, DONT CHANGE THIS
|
|
15
|
-
- 'tests/integration/**/*.py'
|
|
16
|
-
- 'pyproject.toml'
|
|
17
|
-
- 'tox.ini'
|
|
18
|
-
- '.github/workflows/integration-test.yml'
|
|
19
10
|
pull_request:
|
|
20
11
|
branches:
|
|
21
12
|
- "main"
|
|
22
13
|
- "release-**"
|
|
23
14
|
types: [opened, synchronize, reopened, labeled]
|
|
24
|
-
paths:
|
|
25
|
-
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
|
26
|
-
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
|
27
|
-
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
|
28
|
-
# Standard integration test triggers, DONT CHANGE THIS
|
|
29
|
-
- 'tests/integration/**/*.py'
|
|
30
|
-
- 'pyproject.toml'
|
|
31
|
-
- 'tox.ini'
|
|
32
|
-
- '.github/workflows/integration-test.yml'
|
|
33
15
|
|
|
34
16
|
env:
|
|
35
17
|
LC_ALL: en_US.UTF-8
|
|
@@ -42,19 +24,58 @@ permissions:
|
|
|
42
24
|
contents: read
|
|
43
25
|
|
|
44
26
|
jobs:
|
|
27
|
+
check-trigger:
|
|
28
|
+
name: "Check If Integration Should Run"
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
outputs:
|
|
31
|
+
should_run: ${{ steps.check.outputs.should_run }}
|
|
32
|
+
steps:
|
|
33
|
+
- uses: actions/checkout@v4
|
|
34
|
+
|
|
35
|
+
- uses: dorny/paths-filter@v3
|
|
36
|
+
id: filter
|
|
37
|
+
if: github.event_name == 'pull_request'
|
|
38
|
+
with:
|
|
39
|
+
filters: |
|
|
40
|
+
relevant:
|
|
41
|
+
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
|
42
|
+
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
|
43
|
+
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
|
44
|
+
# Standard integration test triggers, DONT CHANGE THIS
|
|
45
|
+
- 'tests/integration/**/*.py'
|
|
46
|
+
- 'pyproject.toml'
|
|
47
|
+
- 'tox.ini'
|
|
48
|
+
- '.github/workflows/integration-test.yml'
|
|
49
|
+
|
|
50
|
+
- name: Determine if tests should run
|
|
51
|
+
id: check
|
|
52
|
+
run: |
|
|
53
|
+
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]] || [[ "${{ github.event_name }}" == "push" ]]; then
|
|
54
|
+
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
|
55
|
+
elif [[ "${{ github.event_name }}" == "pull_request" ]]; then
|
|
56
|
+
# Check if from fork
|
|
57
|
+
if [[ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]]; then
|
|
58
|
+
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
|
59
|
+
# Check if labeled event with correct label
|
|
60
|
+
elif [[ "${{ github.event.action }}" == "labeled" ]] && [[ "${{ contains(github.event.pull_request.labels.*.name, 'run-integration-tests') }}" == "true" ]]; then
|
|
61
|
+
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
|
62
|
+
# Check if relevant paths changed for non-labeled events
|
|
63
|
+
elif [[ "${{ github.event.action }}" != "labeled" ]] && [[ "${{ steps.filter.outputs.relevant }}" == "true" ]]; then
|
|
64
|
+
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
|
65
|
+
else
|
|
66
|
+
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
|
67
|
+
fi
|
|
68
|
+
else
|
|
69
|
+
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
|
70
|
+
fi
|
|
71
|
+
|
|
45
72
|
integration-test:
|
|
46
73
|
name: "Integration Tests - ${{ matrix.python }} on ${{ matrix.platform }}"
|
|
47
74
|
runs-on: "${{ matrix.platform }}"
|
|
75
|
+
needs: check-trigger
|
|
76
|
+
if: needs.check-trigger.outputs.should_run == 'true'
|
|
48
77
|
# Require manual approval before running (via GitHub Environment)
|
|
49
78
|
environment: integration-tests
|
|
50
|
-
# Skip fork PRs (they can't access environment secrets anyway)
|
|
51
|
-
# Also check for 'run-integration-tests' label on labeled events
|
|
52
|
-
if: |
|
|
53
|
-
github.event_name == 'workflow_dispatch' ||
|
|
54
|
-
github.event_name == 'push' ||
|
|
55
|
-
(github.event_name == 'pull_request' &&
|
|
56
|
-
github.event.pull_request.head.repo.full_name == github.repository &&
|
|
57
|
-
(github.event.action != 'labeled' || contains(github.event.pull_request.labels.*.name, 'run-integration-tests')))
|
|
58
79
|
strategy:
|
|
59
80
|
matrix:
|
|
60
81
|
python:
|
|
@@ -89,12 +110,9 @@ jobs:
|
|
|
89
110
|
**/pyproject.toml
|
|
90
111
|
**/requirements*.txt
|
|
91
112
|
|
|
92
|
-
- name: Remove llama-cpp-python from cache
|
|
93
|
-
run: |
|
|
94
|
-
pip cache remove llama_cpp_python
|
|
95
113
|
|
|
96
114
|
- name: Cache huggingface datasets
|
|
97
|
-
uses: actions/cache@
|
|
115
|
+
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
|
|
98
116
|
with:
|
|
99
117
|
path: ~/.cache/huggingface
|
|
100
118
|
# Invalidate cache when any example notebook changes (may affect dataset downloads)
|
|
@@ -111,10 +129,6 @@ jobs:
|
|
|
111
129
|
run: |
|
|
112
130
|
tox -e py3-integrationcov
|
|
113
131
|
|
|
114
|
-
- name: Remove llama-cpp-python from cache
|
|
115
|
-
if: always()
|
|
116
|
-
run: |
|
|
117
|
-
pip cache remove llama_cpp_python
|
|
118
132
|
|
|
119
133
|
- name: Upload integration test coverage to Codecov
|
|
120
134
|
uses: codecov/codecov-action@v4
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Build AMI with Packer
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
|
|
6
|
+
jobs:
|
|
7
|
+
build-ami:
|
|
8
|
+
runs-on: ubuntu-latest
|
|
9
|
+
permissions:
|
|
10
|
+
id-token: write # This is required for OIDC
|
|
11
|
+
contents: read
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- name: Checkout repository
|
|
15
|
+
uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Configure AWS Credentials
|
|
18
|
+
uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c
|
|
19
|
+
with:
|
|
20
|
+
role-to-assume: arn:aws:iam::851725220677:role/github-actions-packer-role
|
|
21
|
+
aws-region: us-east-2
|
|
22
|
+
role-session-name: github-actions-packer # For tracking in CloudTrail
|
|
23
|
+
|
|
24
|
+
- name: Setup Packer
|
|
25
|
+
uses: hashicorp/setup-packer@1aa358be5cf73883762b302a3a03abd66e75b232
|
|
26
|
+
|
|
27
|
+
- name: Build and create AMI
|
|
28
|
+
run: |
|
|
29
|
+
set -euo pipefail
|
|
30
|
+
cd scripts/packer
|
|
31
|
+
packer init .
|
|
32
|
+
packer validate .
|
|
33
|
+
packer build .
|
|
@@ -86,16 +86,7 @@ jobs:
|
|
|
86
86
|
**/pyproject.toml
|
|
87
87
|
**/requirements*.txt
|
|
88
88
|
|
|
89
|
-
- name: Remove llama-cpp-python from cache
|
|
90
|
-
run: |
|
|
91
|
-
pip cache remove llama_cpp_python
|
|
92
89
|
|
|
93
|
-
- name: Cache huggingface
|
|
94
|
-
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
|
|
95
|
-
with:
|
|
96
|
-
path: ~/.cache/huggingface
|
|
97
|
-
# config contains DEFAULT_MODEL
|
|
98
|
-
key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }}
|
|
99
90
|
|
|
100
91
|
- name: Install dependencies
|
|
101
92
|
run: |
|
|
@@ -107,10 +98,6 @@ jobs:
|
|
|
107
98
|
tox -e py3-unitcov
|
|
108
99
|
|
|
109
100
|
|
|
110
|
-
- name: Remove llama-cpp-python from cache
|
|
111
|
-
if: always()
|
|
112
|
-
run: |
|
|
113
|
-
pip cache remove llama_cpp_python
|
|
114
101
|
|
|
115
102
|
- name: Upload coverage to Codecov
|
|
116
103
|
uses: codecov/codecov-action@v4
|
|
@@ -86,7 +86,6 @@ The framework is built around a modular block system with **composability at its
|
|
|
86
86
|
- `transform/`: Data transformation blocks (column operations, text manipulation)
|
|
87
87
|
- `filtering/`: Data filtering blocks with quality thresholds
|
|
88
88
|
- `evaluation/`: Quality evaluation blocks (faithfulness, relevancy assessment)
|
|
89
|
-
- `deprecated_blocks/`: Legacy blocks maintained for backward compatibility
|
|
90
89
|
|
|
91
90
|
**Key Benefits**: Type-safe composition, automatic validation, rich logging, and high-performance async processing.
|
|
92
91
|
|
|
@@ -97,7 +96,6 @@ Flows orchestrate multiple blocks into data processing pipelines:
|
|
|
97
96
|
- **FlowRegistry** (`src/sdg_hub/core/flow/registry.py`): Registry for flow discovery
|
|
98
97
|
- **FlowMetadata** (`src/sdg_hub/core/flow/metadata.py`): Metadata and parameter definitions
|
|
99
98
|
- **FlowValidator** (`src/sdg_hub/core/flow/validation.py`): YAML structure validation
|
|
100
|
-
- **FlowMigration** (`src/sdg_hub/core/flow/migration.py`): Backward compatibility for old flow formats
|
|
101
99
|
|
|
102
100
|
### Flow Configuration
|
|
103
101
|
Flows are defined in YAML files with this structure:
|
|
@@ -148,11 +146,6 @@ All blocks operate on HuggingFace `datasets.Dataset` objects:
|
|
|
148
146
|
- Rich logging provides processing summaries
|
|
149
147
|
- Empty dataset handling with appropriate errors
|
|
150
148
|
|
|
151
|
-
### Backward Compatibility
|
|
152
|
-
The framework maintains compatibility with legacy formats:
|
|
153
|
-
- Deprecated blocks are preserved in `deprecated_blocks/`
|
|
154
|
-
- Flow migration automatically converts old YAML formats
|
|
155
|
-
- Legacy LLMBlocks receive special handling during execution
|
|
156
149
|
|
|
157
150
|
## Testing Guidelines
|
|
158
151
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sdg_hub
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Synthetic Data Generation
|
|
5
5
|
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
|
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE
|
|
25
25
|
Requires-Dist: click<9.0.0,>=8.1.7
|
|
26
|
-
Requires-Dist: datasets
|
|
26
|
+
Requires-Dist: datasets>=4.0.0
|
|
27
27
|
Requires-Dist: httpx<1.0.0,>=0.25.0
|
|
28
28
|
Requires-Dist: jinja2
|
|
29
29
|
Requires-Dist: litellm<1.75.0,>=1.73.0
|
|
@@ -19,8 +19,8 @@ Maps values based on their position/index, useful for applying transformations b
|
|
|
19
19
|
### MeltColumnsBlock
|
|
20
20
|
Reshapes data from wide format to long format, converting multiple columns into key-value pairs.
|
|
21
21
|
|
|
22
|
-
###
|
|
23
|
-
|
|
22
|
+
### UniformColumnValueSetter
|
|
23
|
+
Replaces all values in a column with a single statistical aggregate (mode, min, max, mean, or median) computed from the data. Modifies the column in-place, useful for data normalization, creating baseline comparisons, or extracting dominant values.
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
## 🚀 Next Steps
|
|
@@ -116,7 +116,139 @@ metadata:
|
|
|
116
116
|
max_samples: 10000
|
|
117
117
|
```
|
|
118
118
|
|
|
119
|
-
|
|
119
|
+
### Metadata Fields Reference
|
|
120
|
+
|
|
121
|
+
The metadata section supports the following fields for flow configuration:
|
|
122
|
+
|
|
123
|
+
#### Core Metadata Fields
|
|
124
|
+
|
|
125
|
+
| Field | Type | Required | Default | Description |
|
|
126
|
+
|-------|------|----------|---------|-------------|
|
|
127
|
+
| `name` | `string` | Yes | - | Human-readable name of the flow. Must be at least 1 character. |
|
|
128
|
+
| `id` | `string` | No | Auto-generated | Unique identifier for the flow. Auto-generated from name if not provided. Must be lowercase, contain only alphanumeric characters and hyphens, and not start/end with hyphens. |
|
|
129
|
+
| `description` | `string` | No | `""` | Detailed description of what the flow does and its purpose. |
|
|
130
|
+
| `version` | `string` | No | `"1.0.0"` | Semantic version in `MAJOR.MINOR.PATCH` format, optionally followed by a pre-release suffix (e.g., "1.0.0", "2.1.3-beta"). |
|
|
131
|
+
| `author` | `string` | No | `""` | Name of the flow author or contributor. |
|
|
132
|
+
| `license` | `string` | No | `"Apache-2.0"` | License identifier for the flow (e.g., "Apache-2.0", "MIT", "GPL-3.0"). |
|
|
133
|
+
| `tags` | `List[string]` | No | `[]` | List of tags for categorization and discovery. Tags are automatically converted to lowercase. |
|
|
134
|
+
| `recommended_models` | `RecommendedModels` | No | `None` | Recommended LLM models for optimal flow performance. See below for structure. |
|
|
135
|
+
| `dataset_requirements` | `DatasetRequirements` | No | `None` | Input dataset requirements and validation rules. See below for structure. |
|
|
136
|
+
|
|
137
|
+
#### RecommendedModels Structure
|
|
138
|
+
|
|
139
|
+
The `recommended_models` field helps users choose appropriate LLM models for the flow:
|
|
140
|
+
|
|
141
|
+
```yaml
|
|
142
|
+
recommended_models:
|
|
143
|
+
default: "meta-llama/Llama-3.3-70B-Instruct"
|
|
144
|
+
compatible:
|
|
145
|
+
- "microsoft/phi-4"
|
|
146
|
+
- "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
|
147
|
+
experimental:
|
|
148
|
+
- "google/gemini-pro"
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
| Field | Type | Required | Default | Description |
|
|
152
|
+
|-------|------|----------|---------|-------------|
|
|
153
|
+
| `default` | `string` | Yes | - | The default model recommended for this flow. This is the primary model users should use. |
|
|
154
|
+
| `compatible` | `List[string]` | No | `[]` | List of models known to work well with this flow. Alternative options with good performance. |
|
|
155
|
+
| `experimental` | `List[string]` | No | `[]` | List of experimental models that may work but haven't been extensively tested with this flow. |
|
|
156
|
+
|
|
157
|
+
**Model Selection Behavior:**
|
|
158
|
+
|
|
159
|
+
When the framework needs to select a model, it prioritizes in this order:
|
|
160
|
+
1. `default` model if available
|
|
161
|
+
2. First available model from `compatible` list
|
|
162
|
+
3. First available model from `experimental` list
|
|
163
|
+
|
|
164
|
+
#### DatasetRequirements Structure
|
|
165
|
+
|
|
166
|
+
The `dataset_requirements` field validates input datasets and documents expected data format:
|
|
167
|
+
|
|
168
|
+
```yaml
|
|
169
|
+
dataset_requirements:
|
|
170
|
+
required_columns:
|
|
171
|
+
- "document"
|
|
172
|
+
- "context"
|
|
173
|
+
optional_columns:
|
|
174
|
+
- "metadata"
|
|
175
|
+
- "source"
|
|
176
|
+
min_samples: 1
|
|
177
|
+
max_samples: 10000
|
|
178
|
+
column_types:
|
|
179
|
+
document: "string"
|
|
180
|
+
context: "string"
|
|
181
|
+
description: "Documents with context for Q&A generation"
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
| Field | Type | Required | Default | Description |
|
|
185
|
+
|-------|------|----------|---------|-------------|
|
|
186
|
+
| `required_columns` | `List[string]` | No | `[]` | Column names that must be present in the input dataset. Flow validation will fail if these are missing. |
|
|
187
|
+
| `optional_columns` | `List[string]` | No | `[]` | Column names that are optional but can enhance flow performance if provided. |
|
|
188
|
+
| `min_samples` | `integer` | No | `1` | Minimum number of samples required in the input dataset. Must be at least 1. |
|
|
189
|
+
| `max_samples` | `integer` | No | `None` | Maximum number of samples to process. Useful for resource management and preventing excessive processing. |
|
|
190
|
+
| `column_types` | `Dict[string, string]` | No | `{}` | Expected data types for specific columns (e.g., "string", "integer", "float"). Used for documentation purposes. |
|
|
191
|
+
| `description` | `string` | No | `""` | Human-readable description of the dataset requirements and expected format. |
|
|
192
|
+
|
|
193
|
+
**Validation Behavior:**
|
|
194
|
+
|
|
195
|
+
- The flow will validate the input dataset against `required_columns` before execution
|
|
196
|
+
- Missing required columns will cause the flow to fail with a clear error message
|
|
197
|
+
- Sample count validation ensures the dataset meets `min_samples` and respects `max_samples` if set
|
|
198
|
+
- `max_samples` must be greater than or equal to `min_samples` if both are specified
|
|
199
|
+
|
|
200
|
+
#### Complete Metadata Example
|
|
201
|
+
|
|
202
|
+
Here's a comprehensive example using all available metadata fields:
|
|
203
|
+
|
|
204
|
+
```yaml
|
|
205
|
+
metadata:
|
|
206
|
+
name: "Advanced Document Q&A Generation"
|
|
207
|
+
id: "advanced-document-qa-generation"
|
|
208
|
+
description: |
|
|
209
|
+
A sophisticated flow that processes documents to generate high-quality
|
|
210
|
+
question-answer pairs with faithfulness evaluation and quality filtering.
|
|
211
|
+
Designed for educational content and training data generation.
|
|
212
|
+
version: "2.1.0"
|
|
213
|
+
author: "SDG Hub Team"
|
|
214
|
+
license: "Apache-2.0"
|
|
215
|
+
|
|
216
|
+
recommended_models:
|
|
217
|
+
default: "meta-llama/Llama-3.3-70B-Instruct"
|
|
218
|
+
compatible:
|
|
219
|
+
- "microsoft/phi-4"
|
|
220
|
+
- "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
|
221
|
+
- "meta-llama/Llama-3.1-70B-Instruct"
|
|
222
|
+
experimental:
|
|
223
|
+
- "google/gemini-pro"
|
|
224
|
+
- "anthropic/claude-3-opus"
|
|
225
|
+
|
|
226
|
+
tags:
|
|
227
|
+
- "question-generation"
|
|
228
|
+
- "document-processing"
|
|
229
|
+
- "educational"
|
|
230
|
+
- "qa-pairs"
|
|
231
|
+
|
|
232
|
+
dataset_requirements:
|
|
233
|
+
required_columns:
|
|
234
|
+
- "document"
|
|
235
|
+
- "context"
|
|
236
|
+
optional_columns:
|
|
237
|
+
- "domain"
|
|
238
|
+
- "difficulty_level"
|
|
239
|
+
- "source_url"
|
|
240
|
+
min_samples: 10
|
|
241
|
+
max_samples: 5000
|
|
242
|
+
column_types:
|
|
243
|
+
document: "string"
|
|
244
|
+
context: "string"
|
|
245
|
+
domain: "string"
|
|
246
|
+
difficulty_level: "integer"
|
|
247
|
+
description: |
|
|
248
|
+
Input dataset should contain documents with contextual information.
|
|
249
|
+
Each document should be well-formed text suitable for Q&A generation.
|
|
250
|
+
Optional domain and difficulty_level fields help tailor generation.
|
|
251
|
+
```
|
|
120
252
|
|
|
121
253
|
### Blocks Section
|
|
122
254
|
|
|
@@ -572,6 +704,221 @@ Checkpoint directories contain:
|
|
|
572
704
|
- If all samples are completed, Flow skips processing and returns merged results immediately
|
|
573
705
|
- Clean up checkpoint directories manually when no longer needed
|
|
574
706
|
|
|
707
|
+
## 📊 Flow Metrics and Reporting
|
|
708
|
+
|
|
709
|
+
SDG Hub automatically tracks and reports detailed execution metrics for every flow run, providing visibility into performance, data transformations, and success/failure status. This built-in monitoring system helps you identify bottlenecks, debug issues, and optimize your pipelines.
|
|
710
|
+
|
|
711
|
+
### Automatic Metrics Collection
|
|
712
|
+
|
|
713
|
+
The flow execution system automatically collects comprehensive metrics for each block without any configuration required:
|
|
714
|
+
|
|
715
|
+
**Collected Metrics:**
|
|
716
|
+
- **Block Identification** - Block name and type for clear tracking
|
|
717
|
+
- **Execution Time** - Precise timing for each block's execution
|
|
718
|
+
- **Row Changes** - Input and output row counts to track data filtering
|
|
719
|
+
- **Column Changes** - Added and removed columns to understand data transformations
|
|
720
|
+
- **Status** - Success or failure status for each block
|
|
721
|
+
- **Error Details** - Full error messages and types when blocks fail
|
|
722
|
+
|
|
723
|
+
### Rich Console Output
|
|
724
|
+
|
|
725
|
+
After every flow execution (whether successful or failed), a beautifully formatted summary table is automatically displayed in your terminal using the Rich library:
|
|
726
|
+
|
|
727
|
+
```python
|
|
728
|
+
from sdg_hub.core.flow import Flow
|
|
729
|
+
from datasets import Dataset
|
|
730
|
+
|
|
731
|
+
# Load and configure flow
|
|
732
|
+
flow = Flow.from_yaml("path/to/flow.yaml")
|
|
733
|
+
flow.set_model_config(
|
|
734
|
+
model="hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
|
|
735
|
+
api_base="http://localhost:8000/v1"
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
# Execute flow - metrics displayed automatically at completion
|
|
739
|
+
result = flow.generate(dataset)
|
|
740
|
+
```
|
|
741
|
+
|
|
742
|
+
**Example Console Output:**
|
|
743
|
+
|
|
744
|
+
```
|
|
745
|
+
┌─────────────────── Advanced Document Q&A Generation - Complete ───────────────────┐
|
|
746
|
+
│ Flow Execution Summary │
|
|
747
|
+
│ ┌──────────────────────┬─────────────────┬──────────┬──────────────┬─────────┬──┐│
|
|
748
|
+
│ │ Block Name │ Type │ Duration │ Rows │ Columns │ ││
|
|
749
|
+
│ ├──────────────────────┼─────────────────┼──────────┼──────────────┼─────────┼──┤│
|
|
750
|
+
│ │ backup_document │ DuplicateCol... │ 0.05s │ 100 → 100 │ +1 │ ✓││
|
|
751
|
+
│ │ build_question_... │ PromptBuilder...│ 0.12s │ 100 → 100 │ +1 │ ✓││
|
|
752
|
+
│ │ generate_question │ LLMChatBlock │ 45.30s │ 100 → 100 │ +1 │ ✓││
|
|
753
|
+
│ │ generate_answer │ LLMChatBlock │ 78.45s │ 100 → 100 │ +1 │ ✓││
|
|
754
|
+
│ │ eval_faithfulness... │ LLMChatBlock │ 52.20s │ 100 → 100 │ +1 │ ✓││
|
|
755
|
+
│ │ extract_eval_con... │ LLMParserBlock │ 0.15s │ 100 → 100 │ +2 │ ✓││
|
|
756
|
+
│ │ parse_evaluation │ TextParserBlock │ 0.22s │ 100 → 100 │ +2 │ ✓││
|
|
757
|
+
│ │ filter_faithful │ ColumnValueF... │ 0.08s │ 100 → 87 │ — │ ✓││
|
|
758
|
+
│ ├──────────────────────┼─────────────────┼──────────┼──────────────┼─────────┼──┤│
|
|
759
|
+
│ │ TOTAL │ 8 blocks │ 176.57s │ 87 final │ 9 final │ ✓││
|
|
760
|
+
│ └──────────────────────┴─────────────────┴──────────┴──────────────┴─────────┴──┘│
|
|
761
|
+
└─────────────────────────────────────────────────────────────────────────────────────┘
|
|
762
|
+
```
|
|
763
|
+
|
|
764
|
+
**Table Columns Explained:**
|
|
765
|
+
|
|
766
|
+
| Column | Description |
|
|
767
|
+
|--------|-------------|
|
|
768
|
+
| **Block Name** | The unique name of the block as defined in the flow YAML |
|
|
769
|
+
| **Type** | The block class name (e.g., LLMChatBlock, PromptBuilderBlock) |
|
|
770
|
+
| **Duration** | Execution time in seconds for that specific block |
|
|
771
|
+
| **Rows** | Row transformation showing `input_count → output_count` |
|
|
772
|
+
| **Columns** | Column changes: `+N` for added, `-N` for removed, `+N/-M` for both |
|
|
773
|
+
| **Status** | `✓` for success, `✗` for failure |
|
|
774
|
+
|
|
775
|
+
**Status Indicators:**
|
|
776
|
+
|
|
777
|
+
The panel border color and title reflect the overall execution status:
|
|
778
|
+
|
|
779
|
+
- **Green border + "Complete"** - All blocks executed successfully
|
|
780
|
+
- **Red border + "Failed"** - Flow execution failed (exception thrown)
|
|
781
|
+
- **Yellow border + "Partial"** - Some blocks completed but others failed
|
|
782
|
+
|
|
783
|
+
### JSON Metrics Export
|
|
784
|
+
|
|
785
|
+
For production workflows, detailed metrics can be automatically saved to JSON files for analysis, monitoring, and debugging:
|
|
786
|
+
|
|
787
|
+
```python
|
|
788
|
+
# Enable JSON metrics export by providing a log directory
|
|
789
|
+
result = flow.generate(
|
|
790
|
+
dataset,
|
|
791
|
+
log_dir="./flow_logs"
|
|
792
|
+
)
|
|
793
|
+
|
|
794
|
+
# Metrics automatically saved to: ./flow_logs/{flow_name}_{timestamp}_metrics.json
|
|
795
|
+
```
|
|
796
|
+
|
|
797
|
+
**JSON Structure:**
|
|
798
|
+
|
|
799
|
+
```json
|
|
800
|
+
{
|
|
801
|
+
"flow_name": "Advanced Document Q&A Generation",
|
|
802
|
+
"flow_version": "2.1.0",
|
|
803
|
+
"execution_timestamp": "20250113_143052",
|
|
804
|
+
"execution_successful": true,
|
|
805
|
+
"total_execution_time": 176.57,
|
|
806
|
+
"total_wall_time": 178.23,
|
|
807
|
+
"total_blocks": 8,
|
|
808
|
+
"successful_blocks": 8,
|
|
809
|
+
"failed_blocks": 0,
|
|
810
|
+
"block_metrics": [
|
|
811
|
+
{
|
|
812
|
+
"block_name": "backup_document",
|
|
813
|
+
"block_type": "DuplicateColumnsBlock",
|
|
814
|
+
"execution_time": 0.05,
|
|
815
|
+
"input_rows": 100,
|
|
816
|
+
"output_rows": 100,
|
|
817
|
+
"added_cols": ["original_document"],
|
|
818
|
+
"removed_cols": [],
|
|
819
|
+
"status": "success"
|
|
820
|
+
},
|
|
821
|
+
{
|
|
822
|
+
"block_name": "generate_question",
|
|
823
|
+
"block_type": "LLMChatBlock",
|
|
824
|
+
"execution_time": 45.30,
|
|
825
|
+
"input_rows": 100,
|
|
826
|
+
"output_rows": 100,
|
|
827
|
+
"added_cols": ["question"],
|
|
828
|
+
"removed_cols": [],
|
|
829
|
+
"status": "success"
|
|
830
|
+
}
|
|
831
|
+
]
|
|
832
|
+
}
|
|
833
|
+
```
|
|
834
|
+
|
|
835
|
+
**JSON Fields Reference:**
|
|
836
|
+
|
|
837
|
+
| Field | Type | Description |
|
|
838
|
+
|-------|------|-------------|
|
|
839
|
+
| `flow_name` | string | Human-readable flow name from metadata |
|
|
840
|
+
| `flow_version` | string | Flow version string |
|
|
841
|
+
| `execution_timestamp` | string | Timestamp when execution started (YYYYMMDD_HHMMSS format) |
|
|
842
|
+
| `execution_successful` | boolean | `true` if all blocks succeeded, `false` if any failed |
|
|
843
|
+
| `total_execution_time` | float | Sum of all block execution times in seconds |
|
|
844
|
+
| `total_wall_time` | float | End-to-end wall clock time including overhead |
|
|
845
|
+
| `total_blocks` | integer | Number of blocks in the flow |
|
|
846
|
+
| `successful_blocks` | integer | Count of blocks that executed successfully |
|
|
847
|
+
| `failed_blocks` | integer | Count of blocks that failed |
|
|
848
|
+
| `block_metrics` | array | Detailed metrics for each block (see below) |
|
|
849
|
+
|
|
850
|
+
**Block Metrics Fields:**
|
|
851
|
+
|
|
852
|
+
| Field | Type | Description |
|
|
853
|
+
|-------|------|-------------|
|
|
854
|
+
| `block_name` | string | Unique block identifier |
|
|
855
|
+
| `block_type` | string | Block class name |
|
|
856
|
+
| `execution_time` | float | Block execution duration in seconds |
|
|
857
|
+
| `input_rows` | integer | Number of rows received by the block |
|
|
858
|
+
| `output_rows` | integer | Number of rows produced by the block |
|
|
859
|
+
| `added_cols` | array | List of column names added by this block |
|
|
860
|
+
| `removed_cols` | array | List of column names removed by this block |
|
|
861
|
+
| `status` | string | `"success"` or `"failed"` |
|
|
862
|
+
| `error` | string | Error message (only present if `status` is `"failed"`) |
|
|
863
|
+
| `error_type` | string | Error class name (only present if `status` is `"failed"`) |
|
|
864
|
+
|
|
865
|
+
### Metrics Aggregation
|
|
866
|
+
|
|
867
|
+
When using checkpointing with `save_freq`, blocks may execute multiple times on different chunks of data. The metrics system automatically aggregates these executions per block:
|
|
868
|
+
|
|
869
|
+
- **Execution times** are summed across all chunks
|
|
870
|
+
- **Row counts** are totaled for input and output
|
|
871
|
+
- **Column changes** are merged (duplicates removed)
|
|
872
|
+
- **Status** reflects the worst case (any failure marks the block as failed)
|
|
873
|
+
|
|
874
|
+
This ensures the metrics summary and JSON export always show a cohesive view of the entire flow execution.
|
|
875
|
+
|
|
876
|
+
### Use Cases
|
|
877
|
+
|
|
878
|
+
**Performance Optimization:**
|
|
879
|
+
```python
|
|
880
|
+
# Identify slow blocks for optimization
|
|
881
|
+
result = flow.generate(dataset, log_dir="./optimization_analysis")
|
|
882
|
+
# Review metrics JSON to find blocks with high execution_time
|
|
883
|
+
```
|
|
884
|
+
|
|
885
|
+
**Data Quality Monitoring:**
|
|
886
|
+
```python
|
|
887
|
+
# Track how filtering affects dataset size
|
|
888
|
+
result = flow.generate(dataset)
|
|
889
|
+
# Check console output for row count changes: "100 → 87" indicates 13 filtered
|
|
890
|
+
```
|
|
891
|
+
|
|
892
|
+
**Production Monitoring:**
|
|
893
|
+
```python
|
|
894
|
+
# Continuous metrics collection for production pipelines
|
|
895
|
+
for batch in data_batches:
|
|
896
|
+
result = flow.generate(
|
|
897
|
+
batch,
|
|
898
|
+
log_dir=f"./production_metrics/{date}",
|
|
899
|
+
checkpoint_dir=f"./checkpoints/{batch_id}"
|
|
900
|
+
)
|
|
901
|
+
# Aggregate metrics JSON files for dashboards and alerting
|
|
902
|
+
```
|
|
903
|
+
|
|
904
|
+
**Debugging Failed Runs:**
|
|
905
|
+
```python
|
|
906
|
+
# Automatic error capture in metrics
|
|
907
|
+
try:
|
|
908
|
+
result = flow.generate(dataset, log_dir="./debug_logs")
|
|
909
|
+
except Exception as e:
|
|
910
|
+
# Metrics JSON contains full error details for failed blocks
|
|
911
|
+
print(f"Check ./debug_logs for detailed failure metrics")
|
|
912
|
+
```
|
|
913
|
+
|
|
914
|
+
### Important Notes
|
|
915
|
+
|
|
916
|
+
- **Always Displayed** - Metrics are shown even if the flow fails, helping debug issues
|
|
917
|
+
- **Zero Configuration** - No setup required, metrics collection is automatic
|
|
918
|
+
- **Minimal Overhead** - Metrics collection has a negligible impact on performance
|
|
919
|
+
- **Thread-Safe** - Metrics are properly collected during concurrent block execution
|
|
920
|
+
- **Checkpoint Aware** - Metrics correctly aggregate across checkpointed chunks
|
|
921
|
+
|
|
575
922
|
## 🚀 Next Steps
|
|
576
923
|
|
|
577
924
|
Ready to master the flow system? Explore these detailed guides:
|
|
@@ -359,7 +359,7 @@
|
|
|
359
359
|
"processed_knowledge_dataset = processed_knowledge_dataset.remove_columns(['messages']).rename_column('messages_without_think', 'messages')\n",
|
|
360
360
|
"\n",
|
|
361
361
|
"cfg = RAFTConfig(k_passages=5, max_tokens_per_chunk=400, p_include_oracle=0.9)\n",
|
|
362
|
-
"raft_samples = build_raft_samples(
|
|
362
|
+
"raft_samples = build_raft_samples(processed_knowledge_dataset, cfg)\n",
|
|
363
363
|
"raft_samples = raft_samples.map(build_messages).remove_columns(['question', 'context', 'oracle_context', 'cot_answer', 'answer', 'instruction', 'type', 'meta'])\n",
|
|
364
364
|
"\n",
|
|
365
365
|
"fp = \"<Instruction/Skills dataset>\" # TODO: Replace with huggingface dataset path once its uploaded\n",
|
|
@@ -237,20 +237,13 @@ def build_messages(raft_record: Dict[str, Any]):
|
|
|
237
237
|
Output:
|
|
238
238
|
messages: list of {"role": "system"|"user"|"assistant", "content": str}
|
|
239
239
|
"""
|
|
240
|
-
# 1.
|
|
241
|
-
sys_msg = raft_record.get("instruction") or (
|
|
242
|
-
"You are a domain expert. You must answer questions by first quoting a span "
|
|
243
|
-
"verbatim from the relevant passage, then giving reasoning, then the final answer. "
|
|
244
|
-
"Ignore distractor passages."
|
|
245
|
-
)
|
|
246
|
-
|
|
247
|
-
# 2. User message: serialize passages + question
|
|
240
|
+
# 1. User message: serialize passages + question
|
|
248
241
|
passages = "\n\n".join(
|
|
249
242
|
[f"[Passage {i+1}] {p}" for i, p in enumerate(raft_record["context"])]
|
|
250
243
|
)
|
|
251
244
|
user_msg = f"Passages:\n{passages}\n\nQuestion: {raft_record['question']}"
|
|
252
245
|
|
|
253
|
-
# 3. Assistant message: the gold output
|
|
246
|
+
# 2. Assistant message: the gold output
|
|
254
247
|
assistant_msg = raft_record["answer"]
|
|
255
248
|
|
|
256
249
|
return {"messages" : [
|