sdg-hub 0.4.1__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/workflows/integration-test.yml +49 -32
- sdg_hub-0.5.0/.github/workflows/packer.yml +33 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/workflows/test.yml +0 -13
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/CLAUDE.md +0 -7
- {sdg_hub-0.4.1/src/sdg_hub.egg-info → sdg_hub-0.5.0}/PKG-INFO +2 -2
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/concepts.md +14 -1
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/flows/discovery.md +38 -1
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/flows/overview.md +35 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/quick-start.md +6 -3
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +4 -1
- sdg_hub-0.5.0/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +214 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +72 -43
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +57 -237
- sdg_hub-0.5.0/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +252 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +2 -2
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/pyproject.toml +1 -1
- sdg_hub-0.5.0/scripts/packer/centos.pkr.hcl +52 -0
- sdg_hub-0.5.0/scripts/packer/setup-centos.sh +80 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/_version.py +3 -3
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/__init__.py +0 -22
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/rename_columns.py +19 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/base.py +146 -81
- sdg_hub-0.5.0/src/sdg_hub/core/utils/__init__.py +21 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/flow_metrics.py +116 -0
- sdg_hub-0.5.0/src/sdg_hub/core/utils/time_estimator.py +344 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +16 -10
- {sdg_hub-0.4.1 → sdg_hub-0.5.0/src/sdg_hub.egg-info}/PKG-INFO +2 -2
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/SOURCES.txt +9 -21
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/requires.txt +1 -1
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/transform/test_json_structure_block.py +1 -1
- sdg_hub-0.4.1/tests/blocks/utilblocks/test_renameblock.py → sdg_hub-0.5.0/tests/blocks/transform/test_rename_columns.py +19 -19
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/transform/test_uniform_col_val_setter.py +1 -1
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/flow/test_base.py +75 -2
- sdg_hub-0.5.0/tests/flow/test_time_estimation.py +546 -0
- sdg_hub-0.5.0/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +1 -0
- sdg_hub-0.5.0/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +184 -0
- sdg_hub-0.5.0/tests/utils/test_flow_metrics.py +477 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tox.ini +2 -2
- sdg_hub-0.4.1/.github/workflows/e2e.yml +0 -103
- sdg_hub-0.4.1/.github/workflows/packer.yml +0 -15
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
- sdg_hub-0.4.1/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
- sdg_hub-0.4.1/src/sdg_hub/core/flow/migration.py +0 -198
- sdg_hub-0.4.1/src/sdg_hub/core/utils/__init__.py +0 -13
- sdg_hub-0.4.1/tests/blocks/deprecated/test_llmblock.py +0 -148
- sdg_hub-0.4.1/tests/blocks/utilblocks/test_combinecolumns.py +0 -168
- sdg_hub-0.4.1/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -112
- sdg_hub-0.4.1/tests/blocks/utilblocks/test_flattenblock.py +0 -217
- sdg_hub-0.4.1/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -37
- sdg_hub-0.4.1/tests/blocks/utilblocks/test_selectorblock.py +0 -144
- sdg_hub-0.4.1/tests/blocks/utilblocks/test_settomajority.py +0 -127
- sdg_hub-0.4.1/tests/flow/test_migration.py +0 -449
- sdg_hub-0.4.1/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +0 -108
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/actionlint.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/actions/free-disk-space/action.yml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/dependabot.yml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/mergify.yml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/workflows/actionlint.dockerfile +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/workflows/actionlint.yml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/workflows/docs.yml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/workflows/lint.yml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/workflows/matchers/actionlint.json +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/workflows/matchers/pylint.json +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.github/workflows/pypi.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.gitignore +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.isort.cfg +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.markdownlint-cli2.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.pre-commit-config.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/.pylintrc +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/CONTRIBUTING.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/LICENSE +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/Makefile +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/README.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/.nojekyll +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/README.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/_coverpage.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/_navbar.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/_sidebar.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/api-reference.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/blocks/custom-blocks.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/blocks/filtering-blocks.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/blocks/llm-blocks.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/blocks/overview.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/blocks/transform-blocks.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/development.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/index.html +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/docs/installation.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/annotation/annotation_classification.ipynb +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/annotation/news_classification_flow.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/annotation/news_classification_prompt.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/README.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/knowledge_tuning/knowledge_utils.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/text_analysis/README.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/scripts/ruff.sh +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/setup.cfg +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/base.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/llm/text_parser_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/registry.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/checkpointer.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/metadata.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/registry.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/flow/validation.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/datautils.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/error_handling.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/logger_config.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/path_resolution.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub/py.typed +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/src/sdg_hub.egg-info/top_level.txt +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/llm/test_llm_chat_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/llm/test_llm_parser_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/llm/test_textparserblock.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/test_base_block.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/test_registry.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/testdata/test_config.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/transform/test_index_based_mapper.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/transform/test_melt_columns.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/blocks/transform/test_text_concat.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/flow/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/flow/conftest.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/flow/test_checkpointer.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/flow/test_dataset_requirements.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/flow/test_integration.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/flow/test_metadata.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/flow/test_registry.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/flow/test_validation.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/integration/README.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/integration/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/utils/test_datautils.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/utils/test_error_handling.py +0 -0
- {sdg_hub-0.4.1 → sdg_hub-0.5.0}/tests/utils/test_path_resolution.py +0 -0
@@ -7,28 +7,11 @@ on:
|
|
7
7
|
branches:
|
8
8
|
- "main"
|
9
9
|
- "release-**"
|
10
|
-
paths:
|
11
|
-
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
12
|
-
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
13
|
-
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
14
|
-
# Standard integration test triggers, DONT CHANGE THIS
|
15
|
-
- 'tests/integration/**/*.py'
|
16
|
-
- 'pyproject.toml'
|
17
|
-
- 'tox.ini'
|
18
|
-
- '.github/workflows/integration-test.yml'
|
19
10
|
pull_request:
|
20
11
|
branches:
|
21
12
|
- "main"
|
22
13
|
- "release-**"
|
23
|
-
|
24
|
-
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
25
|
-
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
26
|
-
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
27
|
-
# Standard integration test triggers, DONT CHANGE THIS
|
28
|
-
- 'tests/integration/**/*.py'
|
29
|
-
- 'pyproject.toml'
|
30
|
-
- 'tox.ini'
|
31
|
-
- '.github/workflows/integration-test.yml'
|
14
|
+
types: [opened, synchronize, reopened, labeled]
|
32
15
|
|
33
16
|
env:
|
34
17
|
LC_ALL: en_US.UTF-8
|
@@ -41,17 +24,58 @@ permissions:
|
|
41
24
|
contents: read
|
42
25
|
|
43
26
|
jobs:
|
27
|
+
check-trigger:
|
28
|
+
name: "Check If Integration Should Run"
|
29
|
+
runs-on: ubuntu-latest
|
30
|
+
outputs:
|
31
|
+
should_run: ${{ steps.check.outputs.should_run }}
|
32
|
+
steps:
|
33
|
+
- uses: actions/checkout@v4
|
34
|
+
|
35
|
+
- uses: dorny/paths-filter@v3
|
36
|
+
id: filter
|
37
|
+
if: github.event_name == 'pull_request'
|
38
|
+
with:
|
39
|
+
filters: |
|
40
|
+
relevant:
|
41
|
+
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
42
|
+
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
43
|
+
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
44
|
+
# Standard integration test triggers, DONT CHANGE THIS
|
45
|
+
- 'tests/integration/**/*.py'
|
46
|
+
- 'pyproject.toml'
|
47
|
+
- 'tox.ini'
|
48
|
+
- '.github/workflows/integration-test.yml'
|
49
|
+
|
50
|
+
- name: Determine if tests should run
|
51
|
+
id: check
|
52
|
+
run: |
|
53
|
+
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]] || [[ "${{ github.event_name }}" == "push" ]]; then
|
54
|
+
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
55
|
+
elif [[ "${{ github.event_name }}" == "pull_request" ]]; then
|
56
|
+
# Check if from fork
|
57
|
+
if [[ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]]; then
|
58
|
+
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
59
|
+
# Check if labeled event with correct label
|
60
|
+
elif [[ "${{ github.event.action }}" == "labeled" ]] && [[ "${{ contains(github.event.pull_request.labels.*.name, 'run-integration-tests') }}" == "true" ]]; then
|
61
|
+
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
62
|
+
# Check if relevant paths changed for non-labeled events
|
63
|
+
elif [[ "${{ github.event.action }}" != "labeled" ]] && [[ "${{ steps.filter.outputs.relevant }}" == "true" ]]; then
|
64
|
+
echo "should_run=true" >> "$GITHUB_OUTPUT"
|
65
|
+
else
|
66
|
+
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
67
|
+
fi
|
68
|
+
else
|
69
|
+
echo "should_run=false" >> "$GITHUB_OUTPUT"
|
70
|
+
fi
|
71
|
+
|
44
72
|
integration-test:
|
45
73
|
name: "Integration Tests - ${{ matrix.python }} on ${{ matrix.platform }}"
|
46
74
|
runs-on: "${{ matrix.platform }}"
|
75
|
+
needs: check-trigger
|
76
|
+
if: needs.check-trigger.outputs.should_run == 'true'
|
47
77
|
# Require manual approval before running (via GitHub Environment)
|
48
78
|
environment: integration-tests
|
49
|
-
# Skip fork PRs (they can't access environment secrets anyway)
|
50
|
-
if: |
|
51
|
-
github.event_name == 'workflow_dispatch' ||
|
52
|
-
github.event_name == 'push' ||
|
53
|
-
(github.event_name == 'pull_request' &&
|
54
|
-
github.event.pull_request.head.repo.full_name == github.repository)
|
55
79
|
strategy:
|
56
80
|
matrix:
|
57
81
|
python:
|
@@ -86,12 +110,9 @@ jobs:
|
|
86
110
|
**/pyproject.toml
|
87
111
|
**/requirements*.txt
|
88
112
|
|
89
|
-
- name: Remove llama-cpp-python from cache
|
90
|
-
run: |
|
91
|
-
pip cache remove llama_cpp_python
|
92
113
|
|
93
114
|
- name: Cache huggingface datasets
|
94
|
-
uses: actions/cache@
|
115
|
+
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
|
95
116
|
with:
|
96
117
|
path: ~/.cache/huggingface
|
97
118
|
# Invalidate cache when any example notebook changes (may affect dataset downloads)
|
@@ -108,10 +129,6 @@ jobs:
|
|
108
129
|
run: |
|
109
130
|
tox -e py3-integrationcov
|
110
131
|
|
111
|
-
- name: Remove llama-cpp-python from cache
|
112
|
-
if: always()
|
113
|
-
run: |
|
114
|
-
pip cache remove llama_cpp_python
|
115
132
|
|
116
133
|
- name: Upload integration test coverage to Codecov
|
117
134
|
uses: codecov/codecov-action@v4
|
@@ -0,0 +1,33 @@
|
|
1
|
+
name: Build AMI with Packer
|
2
|
+
|
3
|
+
on:
|
4
|
+
workflow_dispatch:
|
5
|
+
|
6
|
+
jobs:
|
7
|
+
build-ami:
|
8
|
+
runs-on: ubuntu-latest
|
9
|
+
permissions:
|
10
|
+
id-token: write # This is required for OIDC
|
11
|
+
contents: read
|
12
|
+
|
13
|
+
steps:
|
14
|
+
- name: Checkout repository
|
15
|
+
uses: actions/checkout@v4
|
16
|
+
|
17
|
+
- name: Configure AWS Credentials
|
18
|
+
uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c
|
19
|
+
with:
|
20
|
+
role-to-assume: arn:aws:iam::851725220677:role/github-actions-packer-role
|
21
|
+
aws-region: us-east-2
|
22
|
+
role-session-name: github-actions-packer # For tracking in CloudTrail
|
23
|
+
|
24
|
+
- name: Setup Packer
|
25
|
+
uses: hashicorp/setup-packer@1aa358be5cf73883762b302a3a03abd66e75b232
|
26
|
+
|
27
|
+
- name: Build and create AMI
|
28
|
+
run: |
|
29
|
+
set -euo pipefail
|
30
|
+
cd scripts/packer
|
31
|
+
packer init .
|
32
|
+
packer validate .
|
33
|
+
packer build .
|
@@ -86,16 +86,7 @@ jobs:
|
|
86
86
|
**/pyproject.toml
|
87
87
|
**/requirements*.txt
|
88
88
|
|
89
|
-
- name: Remove llama-cpp-python from cache
|
90
|
-
run: |
|
91
|
-
pip cache remove llama_cpp_python
|
92
89
|
|
93
|
-
- name: Cache huggingface
|
94
|
-
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
|
95
|
-
with:
|
96
|
-
path: ~/.cache/huggingface
|
97
|
-
# config contains DEFAULT_MODEL
|
98
|
-
key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }}
|
99
90
|
|
100
91
|
- name: Install dependencies
|
101
92
|
run: |
|
@@ -107,10 +98,6 @@ jobs:
|
|
107
98
|
tox -e py3-unitcov
|
108
99
|
|
109
100
|
|
110
|
-
- name: Remove llama-cpp-python from cache
|
111
|
-
if: always()
|
112
|
-
run: |
|
113
|
-
pip cache remove llama_cpp_python
|
114
101
|
|
115
102
|
- name: Upload coverage to Codecov
|
116
103
|
uses: codecov/codecov-action@v4
|
@@ -86,7 +86,6 @@ The framework is built around a modular block system with **composability at its
|
|
86
86
|
- `transform/`: Data transformation blocks (column operations, text manipulation)
|
87
87
|
- `filtering/`: Data filtering blocks with quality thresholds
|
88
88
|
- `evaluation/`: Quality evaluation blocks (faithfulness, relevancy assessment)
|
89
|
-
- `deprecated_blocks/`: Legacy blocks maintained for backward compatibility
|
90
89
|
|
91
90
|
**Key Benefits**: Type-safe composition, automatic validation, rich logging, and high-performance async processing.
|
92
91
|
|
@@ -97,7 +96,6 @@ Flows orchestrate multiple blocks into data processing pipelines:
|
|
97
96
|
- **FlowRegistry** (`src/sdg_hub/core/flow/registry.py`): Registry for flow discovery
|
98
97
|
- **FlowMetadata** (`src/sdg_hub/core/flow/metadata.py`): Metadata and parameter definitions
|
99
98
|
- **FlowValidator** (`src/sdg_hub/core/flow/validation.py`): YAML structure validation
|
100
|
-
- **FlowMigration** (`src/sdg_hub/core/flow/migration.py`): Backward compatibility for old flow formats
|
101
99
|
|
102
100
|
### Flow Configuration
|
103
101
|
Flows are defined in YAML files with this structure:
|
@@ -148,11 +146,6 @@ All blocks operate on HuggingFace `datasets.Dataset` objects:
|
|
148
146
|
- Rich logging provides processing summaries
|
149
147
|
- Empty dataset handling with appropriate errors
|
150
148
|
|
151
|
-
### Backward Compatibility
|
152
|
-
The framework maintains compatibility with legacy formats:
|
153
|
-
- Deprecated blocks are preserved in `deprecated_blocks/`
|
154
|
-
- Flow migration automatically converts old YAML formats
|
155
|
-
- Legacy LLMBlocks receive special handling during execution
|
156
149
|
|
157
150
|
## Testing Guidelines
|
158
151
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sdg_hub
|
3
|
-
Version: 0.4.1
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Synthetic Data Generation
|
5
5
|
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
6
|
License: Apache-2.0
|
@@ -23,7 +23,7 @@ Requires-Python: >=3.10
|
|
23
23
|
Description-Content-Type: text/markdown
|
24
24
|
License-File: LICENSE
|
25
25
|
Requires-Dist: click<9.0.0,>=8.1.7
|
26
|
-
Requires-Dist: datasets
|
26
|
+
Requires-Dist: datasets>=4.0.0
|
27
27
|
Requires-Dist: httpx<1.0.0,>=0.25.0
|
28
28
|
Requires-Dist: jinja2
|
29
29
|
Requires-Dist: litellm<1.75.0,>=1.73.0
|
@@ -148,9 +148,22 @@ Every block validates data at runtime:
|
|
148
148
|
## 🚀 Best Practices
|
149
149
|
|
150
150
|
### 1. Start Small
|
151
|
-
- Use `dry_run()` to test with small samples
|
151
|
+
- Use `dry_run()` to test with small samples before processing full datasets
|
152
|
+
- Add `enable_time_estimation=True` to predict execution time for the complete dataset
|
152
153
|
- Validate your pipeline before scaling up
|
153
154
|
|
155
|
+
```python
|
156
|
+
# Test AND estimate in one call
|
157
|
+
result = flow.dry_run(dataset, sample_size=5, enable_time_estimation=True, max_concurrency=100)
|
158
|
+
|
159
|
+
# Access dry run results
|
160
|
+
print(f"Tested with {result['sample_size']} samples")
|
161
|
+
print(f"Output columns: {result['final_dataset']['columns']}")
|
162
|
+
|
163
|
+
# Time estimation is automatically displayed in a Rich table format
|
164
|
+
# No need to access it programmatically - the table shows all estimation details
|
165
|
+
```
|
166
|
+
|
154
167
|
### 2. Layer Validation
|
155
168
|
- Use basic block composition (PromptBuilder → LLMChat → Parser → Filter) to assess quality
|
156
169
|
- Implement filtering to maintain data standards
|
@@ -67,7 +67,44 @@ for flow_name in all_flows:
|
|
67
67
|
|
68
68
|
### Getting Flow Information
|
69
69
|
|
70
|
-
|
70
|
+
Access detailed flow metadata and configuration:
|
71
|
+
|
72
|
+
```python
|
73
|
+
from sdg_hub.core.flow import FlowRegistry, Flow
|
74
|
+
|
75
|
+
# Get metadata for a specific flow
|
76
|
+
flow_name = "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
|
77
|
+
metadata = FlowRegistry.get_flow_metadata(flow_name)
|
78
|
+
|
79
|
+
if metadata:
|
80
|
+
print(f"Flow: {metadata.name}")
|
81
|
+
print(f"Version: {metadata.version}")
|
82
|
+
print(f"Author: {metadata.author}")
|
83
|
+
print(f"Description: {metadata.description}")
|
84
|
+
print(f"Tags: {', '.join(metadata.tags)}")
|
85
|
+
print(f"Recommended model: {metadata.recommended_models.get('default', 'Not specified')}")
|
86
|
+
|
87
|
+
# Load flow and get detailed information
|
88
|
+
flow_path = FlowRegistry.get_flow_path(flow_name)
|
89
|
+
flow = Flow.from_yaml(flow_path)
|
90
|
+
|
91
|
+
# Get comprehensive flow info
|
92
|
+
info = flow.get_info()
|
93
|
+
print(f"Total blocks: {info['total_blocks']}")
|
94
|
+
print(f"Block sequence: {', '.join(info['block_names'])}")
|
95
|
+
|
96
|
+
# Get dataset requirements
|
97
|
+
requirements = flow.get_dataset_requirements()
|
98
|
+
if requirements:
|
99
|
+
print(f"Required columns: {requirements.required_columns}")
|
100
|
+
print(f"Description: {requirements.description}")
|
101
|
+
print(f"Min samples: {requirements.min_samples}")
|
102
|
+
|
103
|
+
# Get model recommendations
|
104
|
+
recommendations = flow.get_model_recommendations()
|
105
|
+
print(f"Default model: {recommendations.get('default')}")
|
106
|
+
print(f"Compatible models: {recommendations.get('compatible', [])}")
|
107
|
+
```
|
71
108
|
|
72
109
|
### Getting Flow Paths
|
73
110
|
|
@@ -292,6 +292,41 @@ print(f"Output columns: {dry_result['final_dataset']['columns']}")
|
|
292
292
|
print(f"Sample output: {dry_result['sample_output']}")
|
293
293
|
```
|
294
294
|
|
295
|
+
### Time Estimation
|
296
|
+
|
297
|
+
Predict execution time for your full dataset before running:
|
298
|
+
|
299
|
+
```python
|
300
|
+
# Get dry run results AND time estimation in one call
|
301
|
+
result = flow.dry_run(
|
302
|
+
dataset,
|
303
|
+
sample_size=5,
|
304
|
+
enable_time_estimation=True,
|
305
|
+
max_concurrency=100
|
306
|
+
)
|
307
|
+
|
308
|
+
# Time estimation is automatically displayed in a Rich table format
|
309
|
+
# The table shows estimated time, total API requests, and per-block breakdowns
|
310
|
+
print(f"Dry run completed with {result['sample_size']} samples")
|
311
|
+
print(f"Output columns: {result['final_dataset']['columns']}")
|
312
|
+
```
|
313
|
+
|
314
|
+
**How It Works:**
|
315
|
+
|
316
|
+
The estimation uses 2 dry runs to accurately predict execution time:
|
317
|
+
- Extracts startup overhead (one-time costs)
|
318
|
+
- Calculates per-sample throughput (variable costs)
|
319
|
+
- Uses linear regression to separate fixed from variable costs
|
320
|
+
|
321
|
+
**Accuracy:**
|
322
|
+
- Includes a 20% conservative buffer to account for API variability
|
323
|
+
- Typical accuracy: within 15-40% of actual runtime depending on workload
|
324
|
+
- Better to finish early than run over time!
|
325
|
+
|
326
|
+
**When to Use:**
|
327
|
+
- Before processing with your full dataset
|
328
|
+
- To identify bottleneck blocks and optimize your pipeline
|
329
|
+
|
295
330
|
### Runtime Parameters
|
296
331
|
|
297
332
|
Runtime parameters allow you to customize block behavior at execution time without modifying flow YAML files. You can override global parameters for all blocks or configure specific blocks individually.
|
@@ -62,11 +62,14 @@ dataset = Dataset.from_dict({
|
|
62
62
|
'icl_response_3': ['Java provides platform independence and strong object-oriented features.']
|
63
63
|
})
|
64
64
|
|
65
|
-
# Test with a small sample
|
66
|
-
print("🧪 Running dry run...")
|
67
|
-
dry_result = flow.dry_run(dataset, sample_size=
|
65
|
+
# Test with a small sample AND get time estimate (recommended!)
|
66
|
+
print("🧪 Running dry run with time estimation...")
|
67
|
+
dry_result = flow.dry_run(dataset, sample_size=5, enable_time_estimation=True, max_concurrency=100)
|
68
68
|
print(f"✅ Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
|
69
69
|
print(f"📊 Output columns: {list(dry_result['final_dataset']['columns'])}")
|
70
|
+
|
71
|
+
# Time estimation is automatically displayed in a Rich table format
|
72
|
+
# The table shows estimated time, total API calls, and per-block breakdowns
|
70
73
|
```
|
71
74
|
|
72
75
|
## 📊 Step 3: Generate Synthetic Data
|
@@ -16,6 +16,7 @@ VLLM_MODEL=hosted_vllm/meta-llama/Llama-3.3-70B-Instruct
|
|
16
16
|
VLLM_API_BASE=http://localhost:8000/v1
|
17
17
|
VLLM_API_KEY=EMPTY
|
18
18
|
ENABLE_REASONING=false
|
19
|
+
MAX_CONCURRENCY=50
|
19
20
|
# =============================================================================
|
20
21
|
# OPENAI CONFIGURATION
|
21
22
|
# =============================================================================
|
@@ -38,7 +39,9 @@ MAAS_API_KEY=your-maas-api-key-here
|
|
38
39
|
# =============================================================================
|
39
40
|
# DATA CONFIGURATION
|
40
41
|
# =============================================================================
|
41
|
-
SEED_DATA_PATH=
|
42
|
+
SEED_DATA_PATH=seed_data.jsonl
|
43
|
+
# Set this for subsampling the seed data. Useful for debugging or running validation
|
44
|
+
SEED_DATA_SUBSAMPLE=24
|
42
45
|
OUTPUT_DATA_FOLDER=output_data
|
43
46
|
RUN_ON_VALIDATION_SET=true
|
44
47
|
NUMBER_OF_SUMMARIES=50
|
@@ -0,0 +1,214 @@
|
|
1
|
+
{
|
2
|
+
"cells": [
|
3
|
+
{
|
4
|
+
"cell_type": "markdown",
|
5
|
+
"id": "83f458de",
|
6
|
+
"metadata": {},
|
7
|
+
"source": [
|
8
|
+
"# Document Pre-processing for Knowledge Tuning\n",
|
9
|
+
"\n",
|
10
|
+
"## Overview\n",
|
11
|
+
"\n",
|
12
|
+
"This notebook demonstrates a complete document preprocessing pipeline designed specifically for **knowledge tuning** with sdg-hub. \n",
|
13
|
+
"\n",
|
14
|
+
"## What This Notebook Does\n",
|
15
|
+
"\n",
|
16
|
+
"This preprocessing pipeline transforms raw documents (PDFs, Word docs, etc.) into seed data for data generation:\n",
|
17
|
+
"\n",
|
18
|
+
"1. **Document Parsing**: Converts raw documents to structured markdown format\n",
|
19
|
+
"2. **Chunking**: Splits documents into manageable chunks while preserving structure and context\n",
|
20
|
+
"3. **Seed Data Creation**: Formats chunks with in-context learning (ICL) templates for effective knowledge tuning\n",
|
21
|
+
"\n",
|
22
|
+
"## Prerequisites\n",
|
23
|
+
"\n",
|
24
|
+
"- We will use the existing InstructLab document parser (`docparser_v2.py`) and document parsing configuration (`docling_v2_config.yaml`)\n",
|
25
|
+
"- Raw PDF documents in the `document_collection/` directory\n"
|
26
|
+
]
|
27
|
+
},
|
28
|
+
{
|
29
|
+
"cell_type": "code",
|
30
|
+
"execution_count": null,
|
31
|
+
"id": "daa22c74",
|
32
|
+
"metadata": {},
|
33
|
+
"outputs": [],
|
34
|
+
"source": [
|
35
|
+
"# Step 1: Document Processing Pipeline\n",
|
36
|
+
"# Define the directory containing raw documents to be processed\n",
|
37
|
+
"data_dir = 'document_collection/'\n",
|
38
|
+
"\n",
|
39
|
+
"# Run the document parser to convert documents to markdown\n",
|
40
|
+
"# - input-dir: Directory containing source documents\n",
|
41
|
+
"# - output-dir: Directory where processed markdown files will be saved\n",
|
42
|
+
"# - c: Configuration file specifying parsing parameters\n",
|
43
|
+
"!python ../instructlab/docparser_v2.py --input-dir {data_dir} --output-dir {data_dir} -c ../instructlab/docling_v2_config.yaml"
|
44
|
+
]
|
45
|
+
},
|
46
|
+
{
|
47
|
+
"cell_type": "code",
|
48
|
+
"execution_count": null,
|
49
|
+
"id": "295749b5",
|
50
|
+
"metadata": {},
|
51
|
+
"outputs": [],
|
52
|
+
"source": [
|
53
|
+
"# Step 2: Install Required Dependencies\n",
|
54
|
+
"# Install packages needed for document processing and text chunking\n",
|
55
|
+
"\n",
|
56
|
+
"%pip install docling markdown-it-py\n",
|
57
|
+
"%pip install --upgrade transformers"
|
58
|
+
]
|
59
|
+
},
|
60
|
+
{
|
61
|
+
"cell_type": "code",
|
62
|
+
"execution_count": null,
|
63
|
+
"id": "dd8a4a2a",
|
64
|
+
"metadata": {},
|
65
|
+
"outputs": [],
|
66
|
+
"source": [
|
67
|
+
"# Step 3: Load Processed Document\n",
|
68
|
+
"import glob\n",
|
69
|
+
"\n",
|
70
|
+
"# The docling step above produces markdown for all the PDF files in document_collection; here we load the first markdown file found\n",
|
71
|
+
"with open(glob.glob(f'{data_dir}/*.md')[0], 'r') as f:\n",
|
72
|
+
" text = f.read()"
|
73
|
+
]
|
74
|
+
},
|
75
|
+
{
|
76
|
+
"cell_type": "code",
|
77
|
+
"execution_count": null,
|
78
|
+
"id": "7614dc73",
|
79
|
+
"metadata": {},
|
80
|
+
"outputs": [],
|
81
|
+
"source": [
|
82
|
+
"# Step 4: Text Chunking and Dataset Creation\n",
|
83
|
+
"\n",
|
84
|
+
"from markdown_it import MarkdownIt \n",
|
85
|
+
"from typing import List\n",
|
86
|
+
"import datasets \n",
|
87
|
+
"\n",
|
88
|
+
"\n",
|
89
|
+
"def chunk_markdown(\n",
|
90
|
+
" text: str,\n",
|
91
|
+
" max_tokens: int = 200,\n",
|
92
|
+
" overlap: int = 50\n",
|
93
|
+
") -> List[str]:\n",
|
94
|
+
" \"\"\"\n",
|
95
|
+
" Splits Markdown text into chunks at block-level elements\n",
|
96
|
+
" (headings, paragraphs, lists, tables, code, blockquotes).\n",
|
97
|
+
" Adds overlap (in words) between all consecutive chunks.\n",
|
98
|
+
" \n",
|
99
|
+
" Args:\n",
|
100
|
+
" text: The markdown text to be chunked\n",
|
101
|
+
" max_tokens: Maximum number of words per chunk\n",
|
102
|
+
" overlap: Number of overlapping words between consecutive chunks\n",
|
103
|
+
" \n",
|
104
|
+
" Returns:\n",
|
105
|
+
" List of text chunks with specified overlap\n",
|
106
|
+
" \"\"\"\n",
|
107
|
+
"\n",
|
108
|
+
" # Initialize markdown parser to understand document structure\n",
|
109
|
+
" md = MarkdownIt()\n",
|
110
|
+
" tokens = md.parse(text)\n",
|
111
|
+
"\n",
|
112
|
+
" # Group tokens into block-level segments to preserve markdown structure\n",
|
113
|
+
"    # Note: the word-count chunking further down can still split in the middle of a block.\n",
|
114
|
+
" blocks = []\n",
|
115
|
+
" buf = []\n",
|
116
|
+
" for tok in tokens:\n",
|
117
|
+
" if tok.block and tok.type.endswith(\"_open\"):\n",
|
118
|
+
" buf = []\n",
|
119
|
+
" elif tok.block and tok.type.endswith(\"_close\"):\n",
|
120
|
+
" if buf:\n",
|
121
|
+
" blocks.append(\"\\n\".join(buf).strip())\n",
|
122
|
+
" buf = []\n",
|
123
|
+
" elif tok.content:\n",
|
124
|
+
" buf.append(tok.content)\n",
|
125
|
+
" if buf:\n",
|
126
|
+
" blocks.append(\"\\n\".join(buf).strip())\n",
|
127
|
+
"\n",
|
128
|
+
" # Split blocks into chunks with overlap to maintain context continuity\n",
|
129
|
+
" chunks = []\n",
|
130
|
+
" current_words = []\n",
|
131
|
+
" for block in blocks:\n",
|
132
|
+
" words = block.split()\n",
|
133
|
+
" for w in words:\n",
|
134
|
+
" current_words.append(w)\n",
|
135
|
+
" if len(current_words) >= max_tokens:\n",
|
136
|
+
" # Emit a complete chunk\n",
|
137
|
+
" chunks.append(\" \".join(current_words))\n",
|
138
|
+
" # Prepare next buffer with overlap from the end of this chunk\n",
|
139
|
+
" # This ensures context continuity between chunks\n",
|
140
|
+
" current_words = current_words[-overlap:] if overlap > 0 else []\n",
|
141
|
+
"\n",
|
142
|
+
" # Add any remaining words as the final chunk\n",
|
143
|
+
" if current_words:\n",
|
144
|
+
" chunks.append(\" \".join(current_words))\n",
|
145
|
+
"\n",
|
146
|
+
" return chunks\n",
|
147
|
+
"\n",
|
148
|
+
"\n",
|
149
|
+
"chunks = chunk_markdown(text, max_tokens=5000, overlap=1000)\n",
|
150
|
+
"\n",
|
151
|
+
"\n",
|
152
|
+
"# Prepare seed data for the SDG-Hub knowledge pipeline.\n",
|
153
|
+
"# \n",
|
154
|
+
"# The seed data requires the following fields:\n",
|
155
|
+
"# - document_outline: A concise title or summary that accurately represents the entire document.\n",
|
156
|
+
"# For documents covering multiple themes, consider providing multiple outlines (one per section).\n",
|
157
|
+
"# - icl_document: A representative sample extract from the document. This may include tables, code snippets, definitions, etc.\n",
|
158
|
+
"# - icl_query_1, icl_query_2, icl_query_3: Three questions based on the icl_document sample.\n",
|
159
|
+
"# - domain: The domain or subject area of the document.\n",
|
160
|
+
"#\n",
|
161
|
+
"# The code below creates a HuggingFace Dataset from the document chunks,\n",
|
162
|
+
"# then maps the required ICL fields to each entry, and finally saves the result as a JSONL file.\n",
|
163
|
+
"\n",
|
164
|
+
"seed_data = datasets.Dataset.from_dict({'document': chunks})\n",
|
165
|
+
"\n",
|
166
|
+
"icl = {\n",
|
167
|
+
" \"document_outline\": \"The document contains excerpts from FINTRAC regulations designed to combat money laundering and terrorist financing in Canada\",\n",
|
168
|
+
" \"icl_document\": \"## Overview\\n\\nThis guidance came into effect on June 1, 2021.\\n\\n\\nThis guidance explains the methods that can be used by reporting entities\\n(REs) to verify the identity of a person or an entity.\\n\\n\\n## 1. Meaning of verifying the identity of a person or an entity\\n\\nIt means to use the methods described in this guidance to ensure that the\\ninformation in an identification document or from other informational\\nsources matches the information that the person or entity provided.\\n\\n\\nVerifying identity is a foundational element of Canada's anti-money\\nlaundering and anti-terrorist financing regime and a key component of an\\nRE's relationship with clients. It helps you to know your clients and to\\nunderstand and assess any risk that may be associated to their\\ntransactions or activities.\\n\\n\\n## 2. How to verify the identity of a person\\n\\nYou can use any of the 5 methods described below to identify a person:\\n\\n- 2.1 Government-issued photo identification method\\n\\n- 2.2 Credit file method\\n\\n- 2.3 Dual-process method\\n\\n- 2.4 Affiliate or member method\\n\\n- 2.5 Reliance method\\n\",\n",
|
169
|
+
" \"icl_query_1\": \"In Canada, what are the methods for verifying someone's identity?\",\n",
|
170
|
+
" \"icl_query_2\": \"In Canada, why is it important to confirm a client's identity?\",\n",
|
171
|
+
" \"icl_query_3\": \"In Canada, can I use Reliance method to verify identity of a person?\",\n",
|
172
|
+
" \"domain\": \"Finance\"\n",
|
173
|
+
"}\n",
|
174
|
+
"\n",
|
175
|
+
"# Map the ICL fields to each document chunk (if you want to use the same ICL for all, as shown here)\n",
|
176
|
+
"seed_data = seed_data.map(lambda x: icl)\n",
|
177
|
+
"\n",
|
178
|
+
"# Save the seed data to a JSONL file for downstream use\n",
|
179
|
+
"seed_data.to_json('seed_data.jsonl', orient='records', lines=True)"
|
180
|
+
]
|
181
|
+
},
|
182
|
+
{
|
183
|
+
"cell_type": "markdown",
|
184
|
+
"id": "44f3ff7f",
|
185
|
+
"metadata": {},
|
186
|
+
"source": [
|
187
|
+
"### Next Steps:\n",
|
188
|
+
"- The seed_data.jsonl file is now ready for the knowledge tuning pipeline.\n",
|
189
|
+
"- You can now refer to the [knowledge generation](knowledge_generation.ipynb) notebook"
|
190
|
+
]
|
191
|
+
}
|
192
|
+
],
|
193
|
+
"metadata": {
|
194
|
+
"kernelspec": {
|
195
|
+
"display_name": "sdg_hub",
|
196
|
+
"language": "python",
|
197
|
+
"name": "python3"
|
198
|
+
},
|
199
|
+
"language_info": {
|
200
|
+
"codemirror_mode": {
|
201
|
+
"name": "ipython",
|
202
|
+
"version": 3
|
203
|
+
},
|
204
|
+
"file_extension": ".py",
|
205
|
+
"mimetype": "text/x-python",
|
206
|
+
"name": "python",
|
207
|
+
"nbconvert_exporter": "python",
|
208
|
+
"pygments_lexer": "ipython3",
|
209
|
+
"version": "3.11.12"
|
210
|
+
}
|
211
|
+
},
|
212
|
+
"nbformat": 4,
|
213
|
+
"nbformat_minor": 5
|
214
|
+
}
|