sdg-hub 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdg_hub-0.2.1/src/sdg_hub.egg-info → sdg_hub-0.2.2}/PKG-INFO +40 -15
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/README.md +39 -13
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/blocks/llm-blocks.md +52 -5
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/blocks/overview.md +10 -9
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/concepts.md +7 -1
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/development.md +5 -4
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/flows/overview.md +45 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/quick-start.md +7 -2
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/annotation/annotation_classification.ipynb +128 -153
- sdg_hub-0.2.2/examples/annotation/news_classification_flow.yaml +185 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/README.md +1 -1
- sdg_hub-0.2.2/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/pyproject.toml +0 -1
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/_version.py +16 -3
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +1 -1
- sdg_hub-0.2.2/src/sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +323 -0
- sdg_hub-0.2.2/src/sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +323 -0
- sdg_hub-0.2.2/src/sdg_hub/core/blocks/evaluation/verify_question_block.py +329 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/llm/client_manager.py +61 -24
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/llm/config.py +1 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +62 -7
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +277 -115
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/llm/text_parser_block.py +0 -2
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/registry.py +48 -34
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +1 -1
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/flow/base.py +131 -10
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/utils/datautils.py +29 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -7
- {sdg_hub-0.2.1 → sdg_hub-0.2.2/src/sdg_hub.egg-info}/PKG-INFO +40 -15
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub.egg-info/SOURCES.txt +4 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub.egg-info/requires.txt +0 -1
- sdg_hub-0.2.2/tests/blocks/evaluation/test_evaluate_faithfulness_block.py +271 -0
- sdg_hub-0.2.2/tests/blocks/evaluation/test_evaluate_relevancy_block.py +189 -0
- sdg_hub-0.2.2/tests/blocks/evaluation/test_verify_question_block.py +331 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/llm/test_llm_chat_block.py +2 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +471 -10
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/test_registry.py +196 -32
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/flow/conftest.py +1 -1
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/flow/test_base.py +156 -2
- sdg_hub-0.2.2/tests/flow/test_dataset_requirements.py +419 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/flow/test_integration.py +2 -2
- sdg_hub-0.2.2/tests/utils/test_datautils.py +43 -0
- sdg_hub-0.2.1/src/sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +0 -564
- sdg_hub-0.2.1/src/sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +0 -564
- sdg_hub-0.2.1/src/sdg_hub/core/blocks/evaluation/verify_question_block.py +0 -564
- sdg_hub-0.2.1/tests/blocks/evaluation/test_evaluate_faithfulness_block.py +0 -496
- sdg_hub-0.2.1/tests/blocks/evaluation/test_evaluate_relevancy_block.py +0 -493
- sdg_hub-0.2.1/tests/blocks/evaluation/test_verify_question_block.py +0 -480
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/actionlint.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/actions/free-disk-space/action.yml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/dependabot.yml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/mergify.yml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/workflows/actionlint.dockerfile +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/workflows/actionlint.yml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/workflows/docs.yml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/workflows/e2e.yml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/workflows/lint.yml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/workflows/matchers/actionlint.json +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/workflows/matchers/pylint.json +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/workflows/pypi.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.github/workflows/test.yml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.gitignore +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.isort.cfg +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.markdownlint-cli2.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.pre-commit-config.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/.pylintrc +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/CLAUDE.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/CONTRIBUTING.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/LICENSE +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/Makefile +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/.nojekyll +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/README.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/_coverpage.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/_navbar.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/_sidebar.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/api-reference.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/blocks/custom-blocks.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/blocks/evaluation-blocks.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/blocks/filtering-blocks.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/blocks/transform-blocks.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/flows/discovery.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/index.html +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/docs/installation.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/annotation/news_classification_prompt.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/examples/knowledge_tuning/knowledge_utils.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/scripts/ruff.sh +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/setup.cfg +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/base.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/evaluation/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/transform/rename_columns.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/flow/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/flow/checkpointer.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/flow/metadata.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/flow/migration.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/flow/registry.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/flow/validation.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/utils/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/utils/error_handling.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/utils/logger_config.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/utils/path_resolution.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub/py.typed +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/src/sdg_hub.egg-info/top_level.txt +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/deprecated/test_llmblock.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/evaluation/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/llm/test_textparserblock.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/test_base_block.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/testdata/test_config.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/testdata/test_evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/testdata/test_evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/testdata/test_verify_question.yaml +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/transform/test_index_based_mapper.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/transform/test_melt_columns.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/transform/test_text_concat.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/transform/test_uniform_col_val_setter.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/utilblocks/test_combinecolumns.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/utilblocks/test_flattenblock.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/utilblocks/test_renameblock.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/utilblocks/test_selectorblock.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/blocks/utilblocks/test_settomajority.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/flow/__init__.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/flow/test_checkpointer.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/flow/test_metadata.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/flow/test_migration.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/flow/test_registry.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/flow/test_validation.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/utils/test_error_handling.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tests/utils/test_path_resolution.py +0 -0
- {sdg_hub-0.2.1 → sdg_hub-0.2.2}/tox.ini +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sdg_hub
|
3
|
-
Version: 0.2.1
|
3
|
+
Version: 0.2.2
|
4
4
|
Summary: Synthetic Data Generation
|
5
5
|
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
6
|
License: Apache-2.0
|
@@ -27,7 +27,6 @@ Requires-Dist: datasets<4.0.0,>=2.18.0
|
|
27
27
|
Requires-Dist: httpx<1.0.0,>=0.25.0
|
28
28
|
Requires-Dist: jinja2
|
29
29
|
Requires-Dist: litellm<1.75.0,>=1.73.0
|
30
|
-
Requires-Dist: openai<2.0.0,>=1.13.3
|
31
30
|
Requires-Dist: rich
|
32
31
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
33
32
|
Requires-Dist: python-dotenv<2.0.0,>=1.0.0
|
@@ -92,6 +91,8 @@ A modular Python framework for building synthetic data generation pipelines usin
|
|
92
91
|
|
93
92
|
**📊 Rich Monitoring** - Detailed logging with progress bars and execution summaries.
|
94
93
|
|
94
|
+
**📋 Dataset Schema Discovery** - Instantly discover required data formats. Get empty datasets with correct schema for easy validation and data preparation.
|
95
|
+
|
95
96
|
**🧩 Easily Extensible** - Create custom blocks with simple inheritance. Rich logging and monitoring built-in.
|
96
97
|
|
97
98
|
|
@@ -176,22 +177,46 @@ flow.set_model_config(
|
|
176
177
|
api_key="your_key",
|
177
178
|
)
|
178
179
|
```
|
179
|
-
####
|
180
|
+
#### Discover dataset requirements and create your dataset
|
180
181
|
```python
|
181
|
-
#
|
182
|
-
dataset
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
'
|
190
|
-
'
|
191
|
-
'
|
192
|
-
'
|
182
|
+
# First, discover what data the flow needs
|
183
|
+
# Get an empty dataset with the exact schema needed
|
184
|
+
schema_dataset = flow.get_dataset_schema() # Get empty dataset with correct schema
|
185
|
+
print(f"Required columns: {schema_dataset.column_names}")
|
186
|
+
print(f"Schema: {schema_dataset.features}")
|
187
|
+
|
188
|
+
# Option 1: Add data directly to the schema dataset
|
189
|
+
dataset = schema_dataset.add_item({
|
190
|
+
'document': 'Your document text here...',
|
191
|
+
'document_outline': '1. Topic A; 2. Topic B; 3. Topic C',
|
192
|
+
'domain': 'Computer Science',
|
193
|
+
'icl_document': 'Example document for in-context learning...',
|
194
|
+
'icl_query_1': 'Example question 1?',
|
195
|
+
'icl_response_1': 'Example answer 1',
|
196
|
+
'icl_query_2': 'Example question 2?',
|
197
|
+
'icl_response_2': 'Example answer 2',
|
198
|
+
'icl_query_3': 'Example question 3?',
|
199
|
+
'icl_response_3': 'Example answer 3'
|
193
200
|
})
|
194
201
|
|
202
|
+
# Option 2: Create your own dataset and validate the schema
|
203
|
+
my_dataset = Dataset.from_dict(my_data_dict)
|
204
|
+
if my_dataset.features == schema_dataset.features:
|
205
|
+
print("✅ Schema matches - ready to generate!")
|
206
|
+
dataset = my_dataset
|
207
|
+
else:
|
208
|
+
print("❌ Schema mismatch - check your columns")
|
209
|
+
|
210
|
+
# Option 3: Get raw requirements for detailed inspection
|
211
|
+
requirements = flow.get_dataset_requirements()
|
212
|
+
if requirements:
|
213
|
+
print(f"Required: {requirements.required_columns}")
|
214
|
+
print(f"Optional: {requirements.optional_columns}")
|
215
|
+
print(f"Min samples: {requirements.min_samples}")
|
216
|
+
```
|
217
|
+
|
218
|
+
#### Dry Run and Generate
|
219
|
+
```python
|
195
220
|
# Quick Testing with Dry Run
|
196
221
|
dry_result = flow.dry_run(dataset, sample_size=1)
|
197
222
|
print(f"Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
|
@@ -24,6 +24,8 @@ A modular Python framework for building synthetic data generation pipelines usin
|
|
24
24
|
|
25
25
|
**📊 Rich Monitoring** - Detailed logging with progress bars and execution summaries.
|
26
26
|
|
27
|
+
**📋 Dataset Schema Discovery** - Instantly discover required data formats. Get empty datasets with correct schema for easy validation and data preparation.
|
28
|
+
|
27
29
|
**🧩 Easily Extensible** - Create custom blocks with simple inheritance. Rich logging and monitoring built-in.
|
28
30
|
|
29
31
|
|
@@ -108,22 +110,46 @@ flow.set_model_config(
|
|
108
110
|
api_key="your_key",
|
109
111
|
)
|
110
112
|
```
|
111
|
-
####
|
113
|
+
#### Discover dataset requirements and create your dataset
|
112
114
|
```python
|
113
|
-
#
|
114
|
-
dataset
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
'
|
122
|
-
'
|
123
|
-
'
|
124
|
-
'
|
115
|
+
# First, discover what data the flow needs
|
116
|
+
# Get an empty dataset with the exact schema needed
|
117
|
+
schema_dataset = flow.get_dataset_schema() # Get empty dataset with correct schema
|
118
|
+
print(f"Required columns: {schema_dataset.column_names}")
|
119
|
+
print(f"Schema: {schema_dataset.features}")
|
120
|
+
|
121
|
+
# Option 1: Add data directly to the schema dataset
|
122
|
+
dataset = schema_dataset.add_item({
|
123
|
+
'document': 'Your document text here...',
|
124
|
+
'document_outline': '1. Topic A; 2. Topic B; 3. Topic C',
|
125
|
+
'domain': 'Computer Science',
|
126
|
+
'icl_document': 'Example document for in-context learning...',
|
127
|
+
'icl_query_1': 'Example question 1?',
|
128
|
+
'icl_response_1': 'Example answer 1',
|
129
|
+
'icl_query_2': 'Example question 2?',
|
130
|
+
'icl_response_2': 'Example answer 2',
|
131
|
+
'icl_query_3': 'Example question 3?',
|
132
|
+
'icl_response_3': 'Example answer 3'
|
125
133
|
})
|
126
134
|
|
135
|
+
# Option 2: Create your own dataset and validate the schema
|
136
|
+
my_dataset = Dataset.from_dict(my_data_dict)
|
137
|
+
if my_dataset.features == schema_dataset.features:
|
138
|
+
print("✅ Schema matches - ready to generate!")
|
139
|
+
dataset = my_dataset
|
140
|
+
else:
|
141
|
+
print("❌ Schema mismatch - check your columns")
|
142
|
+
|
143
|
+
# Option 3: Get raw requirements for detailed inspection
|
144
|
+
requirements = flow.get_dataset_requirements()
|
145
|
+
if requirements:
|
146
|
+
print(f"Required: {requirements.required_columns}")
|
147
|
+
print(f"Optional: {requirements.optional_columns}")
|
148
|
+
print(f"Min samples: {requirements.min_samples}")
|
149
|
+
```
|
150
|
+
|
151
|
+
#### Dry Run and Generate
|
152
|
+
```python
|
127
153
|
# Quick Testing with Dry Run
|
128
154
|
dry_result = flow.dry_run(dataset, sample_size=1)
|
129
155
|
print(f"Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
|
@@ -34,12 +34,9 @@ The unified chat block that replaces provider-specific implementations with a si
|
|
34
34
|
### Basic Usage
|
35
35
|
|
36
36
|
```python
|
37
|
-
from sdg_hub.core.blocks import BlockRegistry
|
37
|
+
from sdg_hub.core.blocks import LLMChatBlock
|
38
38
|
from datasets import Dataset
|
39
39
|
|
40
|
-
# Get the LLM chat block
|
41
|
-
LLMChatBlock = BlockRegistry.get_block("LLMChatBlock")
|
42
|
-
|
43
40
|
# Configure for OpenAI
|
44
41
|
chat_block = LLMChatBlock(
|
45
42
|
block_name="question_answerer",
|
@@ -133,7 +130,7 @@ dataset = Dataset.from_dict({
|
|
133
130
|
})
|
134
131
|
```
|
135
132
|
|
136
|
-
#### Async Processing
|
133
|
+
#### Async Processing & Concurrency Control
|
137
134
|
```python
|
138
135
|
chat_block = LLMChatBlock(
|
139
136
|
block_name="async_chat",
|
@@ -147,6 +144,56 @@ chat_block = LLMChatBlock(
|
|
147
144
|
result = chat_block.generate(large_dataset)
|
148
145
|
```
|
149
146
|
|
147
|
+
**Flow-Level Concurrency Control:**
|
148
|
+
|
149
|
+
When using LLM blocks within flows, you can control concurrency to prevent overwhelming API servers or hitting rate limits:
|
150
|
+
|
151
|
+
```python
|
152
|
+
from sdg_hub import Flow
|
153
|
+
|
154
|
+
# Load a flow with LLM blocks
|
155
|
+
flow = Flow.from_yaml("path/to/your/flow.yaml")
|
156
|
+
flow.set_model_config(model="openai/gpt-4o", api_key="your-key")
|
157
|
+
|
158
|
+
# Control concurrency for each LLM block in the flow
|
159
|
+
result = flow.generate(
|
160
|
+
dataset,
|
161
|
+
max_concurrency=5 # Max 5 concurrent requests at any time
|
162
|
+
)
|
163
|
+
```
|
164
|
+
|
165
|
+
**Benefits of Concurrency Control:**
|
166
|
+
- **Rate Limit Management** - Prevent API throttling by limiting concurrent requests
|
167
|
+
- **Resource Control** - Manage memory and network usage for large datasets
|
168
|
+
- **Provider-Friendly** - Respect API provider recommendations for concurrent requests
|
169
|
+
- **Automatic Scaling** - No concurrency limit = maximum parallelism for fastest processing
|
170
|
+
|
171
|
+
**How It Works:**
|
172
|
+
|
173
|
+
The unified async system automatically detects whether you're processing single or multiple messages and applies concurrency control appropriately:
|
174
|
+
|
175
|
+
```python
|
176
|
+
# Single message - processed immediately
|
177
|
+
single_message = [{"role": "user", "content": "Hello"}]
|
178
|
+
|
179
|
+
# Multiple messages - concurrency controlled via semaphore
|
180
|
+
batch_messages = [
|
181
|
+
[{"role": "user", "content": "Question 1"}],
|
182
|
+
[{"role": "user", "content": "Question 2"}],
|
183
|
+
[{"role": "user", "content": "Question 3"}],
|
184
|
+
# ... up to thousands of messages
|
185
|
+
]
|
186
|
+
|
187
|
+
# Both cases use the same unified API under the hood
|
188
|
+
# Concurrency is managed transparently
|
189
|
+
```
|
190
|
+
|
191
|
+
**Performance Guidelines:**
|
192
|
+
- **Small datasets (<100 samples)**: No concurrency limit needed
|
193
|
+
- **Medium datasets (100-1000 samples)**: `max_concurrency=10-20`
|
194
|
+
- **Large datasets (1000+ samples)**: `max_concurrency=5-10` (respect API limits)
|
195
|
+
- **Production workloads**: Start conservative and tune based on error rates
|
196
|
+
|
150
197
|
### Message Format
|
151
198
|
|
152
199
|
LLMChatBlock expects messages in OpenAI chat format:
|
@@ -30,14 +30,15 @@ All blocks inherit from `BaseBlock`, which provides:
|
|
30
30
|
|
31
31
|
### Standard Configuration
|
32
32
|
```python
|
33
|
-
|
33
|
+
# Import the specific block you need
|
34
|
+
from sdg_hub.core.blocks import LLMChatBlock
|
34
35
|
|
35
36
|
# Every block has these standard fields
|
36
|
-
|
37
|
-
block = MyBlock(
|
37
|
+
block = LLMChatBlock(
|
38
38
|
block_name="my_unique_block", # Required: unique identifier
|
39
|
-
input_cols=["
|
40
|
-
output_cols=["
|
39
|
+
input_cols=["input_text"], # Column this block needs
|
40
|
+
output_cols=["response"], # Column this block creates
|
41
|
+
model="openai/gpt-4o", # Required: provider/model format
|
41
42
|
# ... block-specific configuration
|
42
43
|
)
|
43
44
|
```
|
@@ -86,13 +87,13 @@ print(f"Found {len(available_blocks)} blocks")
|
|
86
87
|
|
87
88
|
### 2. Block Instantiation
|
88
89
|
```python
|
89
|
-
#
|
90
|
-
|
90
|
+
# Import the specific block you need
|
91
|
+
from sdg_hub.core.blocks import LLMChatBlock
|
91
92
|
|
92
93
|
# Create an instance with configuration
|
93
|
-
chat_block =
|
94
|
+
chat_block = LLMChatBlock(
|
94
95
|
block_name="question_answerer",
|
95
|
-
|
96
|
+
model="openai/gpt-4o",
|
96
97
|
input_cols=["question"],
|
97
98
|
output_cols=["answer"],
|
98
99
|
prompt_template="Answer this question: {question}"
|
@@ -159,7 +159,13 @@ Every block validates data at runtime:
|
|
159
159
|
- Watch execution logs for bottlenecks
|
160
160
|
- Use async-friendly blocks for LLM operations
|
161
161
|
|
162
|
-
### 4. Design for Reuse
|
162
|
+
### 4. Optimize for Scale
|
163
|
+
- Use `max_concurrency` parameter to control API request rates
|
164
|
+
- Start with conservative concurrency limits (5-10) for production
|
165
|
+
- Increase concurrency carefully while monitoring error rates
|
166
|
+
- Consider provider-specific rate limits and costs
|
167
|
+
|
168
|
+
### 5. Design for Reuse
|
163
169
|
- Create modular flows that can be combined
|
164
170
|
- Use parameters for customization points
|
165
171
|
|
@@ -123,15 +123,16 @@ Create comprehensive tests following this pattern:
|
|
123
123
|
|
124
124
|
import pytest
|
125
125
|
from datasets import Dataset
|
126
|
-
from sdg_hub.core.blocks import BlockRegistry
|
127
126
|
from sdg_hub.core.utils.error_handling import MissingColumnError
|
127
|
+
# Import your custom block directly
|
128
|
+
from .my_new_block import MyNewBlock
|
128
129
|
|
129
130
|
class TestMyNewBlock:
|
130
131
|
"""Test suite for MyNewBlock."""
|
131
132
|
|
132
133
|
def test_basic_functionality(self):
|
133
134
|
"""Test basic block functionality."""
|
134
|
-
block =
|
135
|
+
block = MyNewBlock(
|
135
136
|
block_name="test_block",
|
136
137
|
input_cols=["input"],
|
137
138
|
output_cols=["output"]
|
@@ -149,7 +150,7 @@ class TestMyNewBlock:
|
|
149
150
|
def test_configuration_validation(self):
|
150
151
|
"""Test parameter validation."""
|
151
152
|
with pytest.raises(ValueError):
|
152
|
-
|
153
|
+
MyNewBlock(
|
153
154
|
block_name="bad_config",
|
154
155
|
input_cols=["input"],
|
155
156
|
output_cols=["output"],
|
@@ -158,7 +159,7 @@ class TestMyNewBlock:
|
|
158
159
|
|
159
160
|
def test_missing_columns(self):
|
160
161
|
"""Test error handling for missing columns."""
|
161
|
-
block =
|
162
|
+
block = MyNewBlock(
|
162
163
|
block_name="test_block",
|
163
164
|
input_cols=["missing_column"],
|
164
165
|
output_cols=["output"]
|
@@ -296,6 +296,51 @@ result = flow.generate(
|
|
296
296
|
)
|
297
297
|
```
|
298
298
|
|
299
|
+
### Concurrency Control
|
300
|
+
|
301
|
+
For flows containing LLM blocks, you can control the maximum number of concurrent API requests to prevent overwhelming servers or hitting rate limits:
|
302
|
+
|
303
|
+
```python
|
304
|
+
# Basic concurrency control
|
305
|
+
result = flow.generate(
|
306
|
+
dataset,
|
307
|
+
max_concurrency=5 # Max 5 concurrent requests per LLM block execution
|
308
|
+
)
|
309
|
+
|
310
|
+
# Combined with other parameters
|
311
|
+
result = flow.generate(
|
312
|
+
dataset,
|
313
|
+
max_concurrency=10,
|
314
|
+
runtime_params={
|
315
|
+
"temperature": 0.7,
|
316
|
+
"max_tokens": 200
|
317
|
+
}
|
318
|
+
)
|
319
|
+
```
|
320
|
+
|
321
|
+
**When to Use Concurrency Control:**
|
322
|
+
|
323
|
+
- **Large Datasets** - Process thousands of samples without overwhelming APIs
|
324
|
+
- **Rate Limit Management** - Respect provider-specific concurrent request limits
|
325
|
+
- **Production Workloads** - Ensure stable, predictable resource usage
|
326
|
+
- **Cost Optimization** - Prevent burst API charges from uncontrolled parallelism
|
327
|
+
|
328
|
+
**Recommended Settings:**
|
329
|
+
|
330
|
+
```python
|
331
|
+
# Conservative (recommended for production)
|
332
|
+
result = flow.generate(dataset, max_concurrency=5)
|
333
|
+
|
334
|
+
# Moderate (good for development/testing)
|
335
|
+
result = flow.generate(dataset, max_concurrency=10)
|
336
|
+
|
337
|
+
# Aggressive (only for robust APIs and small datasets)
|
338
|
+
result = flow.generate(dataset, max_concurrency=20)
|
339
|
+
|
340
|
+
# No limit (maximum speed, use with caution)
|
341
|
+
result = flow.generate(dataset) # Default behavior
|
342
|
+
```
|
343
|
+
|
299
344
|
## 🚀 Next Steps
|
300
345
|
|
301
346
|
Ready to master the flow system? Explore these detailed guides:
|
@@ -107,11 +107,16 @@ print(f"🔎 QA Generation Flows: {qa_flows}")
|
|
107
107
|
eval_flows = FlowRegistry.search_flows(tag="evaluation")
|
108
108
|
print(f"📊 Evaluation Flows: {eval_flows}")
|
109
109
|
|
110
|
+
# List all blocks by categories
|
111
|
+
all_blocks = BlockRegistry.list_blocks(grouped=True)
|
112
|
+
for category, blocks in all_blocks.items():
|
113
|
+
print(f"Blocks for category {category}: {blocks}")
|
114
|
+
|
110
115
|
# Find blocks by category
|
111
|
-
llm_blocks = BlockRegistry.
|
116
|
+
llm_blocks = BlockRegistry.list_blocks(category="llm")
|
112
117
|
print(f"🧠 LLM Blocks: {llm_blocks}")
|
113
118
|
|
114
|
-
transform_blocks = BlockRegistry.
|
119
|
+
transform_blocks = BlockRegistry.list_blocks(category="transform")
|
115
120
|
print(f"🔄 Transform Blocks: {transform_blocks}")
|
116
121
|
```
|
117
122
|
|