sdg-hub 0.4.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub-0.4.1/.github/workflows/integration-test.yml +140 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/test.yml +6 -3
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.gitignore +5 -0
- {sdg_hub-0.4.0/src/sdg_hub.egg-info → sdg_hub-0.4.1}/PKG-INFO +2 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/blocks/llm-blocks.md +236 -2
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/flows/overview.md +180 -12
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/quick-start.md +25 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/annotation/news_classification_flow.yaml +0 -5
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +0 -10
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/pyproject.toml +8 -2
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/__init__.py +0 -2
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/_version.py +3 -3
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/__init__.py +1 -2
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/flow/__init__.py +3 -4
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/flow/base.py +4 -69
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/flow/metadata.py +1 -68
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/flow/registry.py +0 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +0 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +0 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +0 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +1 -2
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +0 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1/src/sdg_hub.egg-info}/PKG-INFO +2 -1
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub.egg-info/SOURCES.txt +7 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub.egg-info/requires.txt +1 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/test_base.py +1 -46
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/test_integration.py +0 -32
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/test_metadata.py +1 -73
- sdg_hub-0.4.1/tests/integration/README.md +95 -0
- sdg_hub-0.4.1/tests/integration/__init__.py +3 -0
- sdg_hub-0.4.1/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +62 -0
- sdg_hub-0.4.1/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +1 -0
- sdg_hub-0.4.1/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +64 -0
- sdg_hub-0.4.1/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +108 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tox.ini +31 -3
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/actionlint.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/actions/free-disk-space/action.yml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/dependabot.yml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/mergify.yml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/actionlint.dockerfile +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/actionlint.yml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/docs.yml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/e2e.yml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/lint.yml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/matchers/actionlint.json +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/matchers/pylint.json +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/packer.yml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.github/workflows/pypi.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.isort.cfg +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.markdownlint-cli2.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.pre-commit-config.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/.pylintrc +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/CLAUDE.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/CONTRIBUTING.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/LICENSE +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/Makefile +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/README.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/.nojekyll +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/README.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/_coverpage.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/_navbar.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/_sidebar.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/api-reference.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/blocks/custom-blocks.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/blocks/filtering-blocks.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/blocks/overview.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/blocks/transform-blocks.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/concepts.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/development.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/flows/discovery.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/index.html +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/docs/installation.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/annotation/annotation_classification.ipynb +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/annotation/news_classification_prompt.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/README.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/knowledge_tuning/knowledge_utils.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/text_analysis/README.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/scripts/ruff.sh +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/setup.cfg +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/base.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/llm/text_parser_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/registry.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/transform/rename_columns.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/flow/checkpointer.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/flow/migration.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/flow/validation.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/utils/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/utils/datautils.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/utils/error_handling.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/utils/flow_metrics.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/utils/logger_config.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/utils/path_resolution.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub/py.typed +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/src/sdg_hub.egg-info/top_level.txt +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/deprecated/test_llmblock.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/llm/test_llm_chat_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/llm/test_llm_parser_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/llm/test_textparserblock.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/test_base_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/test_registry.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/testdata/test_config.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/transform/test_index_based_mapper.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/transform/test_json_structure_block.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/transform/test_melt_columns.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/transform/test_text_concat.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/transform/test_uniform_col_val_setter.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/utilblocks/test_combinecolumns.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/utilblocks/test_flattenblock.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/utilblocks/test_renameblock.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/utilblocks/test_selectorblock.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/blocks/utilblocks/test_settomajority.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/__init__.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/conftest.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/test_checkpointer.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/test_dataset_requirements.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/test_migration.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/test_registry.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/flow/test_validation.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/utils/test_datautils.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/utils/test_error_handling.py +0 -0
- {sdg_hub-0.4.0 → sdg_hub-0.4.1}/tests/utils/test_path_resolution.py +0 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
|
3
|
+
name: Integration Test
|
4
|
+
on:
|
5
|
+
workflow_dispatch:
|
6
|
+
push:
|
7
|
+
branches:
|
8
|
+
- "main"
|
9
|
+
- "release-**"
|
10
|
+
paths:
|
11
|
+
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
12
|
+
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
13
|
+
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
14
|
+
# Standard integration test triggers, DON'T CHANGE THIS
|
15
|
+
- 'tests/integration/**/*.py'
|
16
|
+
- 'pyproject.toml'
|
17
|
+
- 'tox.ini'
|
18
|
+
- '.github/workflows/integration-test.yml'
|
19
|
+
pull_request:
|
20
|
+
branches:
|
21
|
+
- "main"
|
22
|
+
- "release-**"
|
23
|
+
paths:
|
24
|
+
# Only trigger on changes to relevant flows and examples (EXTEND THIS):
|
25
|
+
- 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
|
26
|
+
- 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
|
27
|
+
# Standard integration test triggers, DON'T CHANGE THIS
|
28
|
+
- 'tests/integration/**/*.py'
|
29
|
+
- 'pyproject.toml'
|
30
|
+
- 'tox.ini'
|
31
|
+
- '.github/workflows/integration-test.yml'
|
32
|
+
|
33
|
+
env:
|
34
|
+
LC_ALL: en_US.UTF-8
|
35
|
+
|
36
|
+
defaults:
|
37
|
+
run:
|
38
|
+
shell: bash
|
39
|
+
|
40
|
+
permissions:
|
41
|
+
contents: read
|
42
|
+
|
43
|
+
jobs:
|
44
|
+
integration-test:
|
45
|
+
name: "Integration Tests - ${{ matrix.python }} on ${{ matrix.platform }}"
|
46
|
+
runs-on: "${{ matrix.platform }}"
|
47
|
+
# Require manual approval before running (via GitHub Environment)
|
48
|
+
environment: integration-tests
|
49
|
+
# Skip fork PRs (they can't access environment secrets anyway)
|
50
|
+
if: |
|
51
|
+
github.event_name == 'workflow_dispatch' ||
|
52
|
+
github.event_name == 'push' ||
|
53
|
+
(github.event_name == 'pull_request' &&
|
54
|
+
github.event.pull_request.head.repo.full_name == github.repository)
|
55
|
+
strategy:
|
56
|
+
matrix:
|
57
|
+
python:
|
58
|
+
- "3.11"
|
59
|
+
platform:
|
60
|
+
- "ubuntu-latest"
|
61
|
+
steps:
|
62
|
+
- name: "Harden Runner"
|
63
|
+
uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
|
64
|
+
with:
|
65
|
+
egress-policy: audit # TODO: change to 'egress-policy: block' after a couple of runs
|
66
|
+
|
67
|
+
- name: Checkout
|
68
|
+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
69
|
+
with:
|
70
|
+
# https://github.com/actions/checkout/issues/249
|
71
|
+
fetch-depth: 0
|
72
|
+
|
73
|
+
- name: Free disk space
|
74
|
+
uses: ./.github/actions/free-disk-space
|
75
|
+
|
76
|
+
- name: Install the expect package
|
77
|
+
run: |
|
78
|
+
sudo apt-get install -y expect
|
79
|
+
|
80
|
+
- name: Setup Python ${{ matrix.python }}
|
81
|
+
uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
|
82
|
+
with:
|
83
|
+
python-version: ${{ matrix.python }}
|
84
|
+
cache: pip
|
85
|
+
cache-dependency-path: |
|
86
|
+
**/pyproject.toml
|
87
|
+
**/requirements*.txt
|
88
|
+
|
89
|
+
- name: Remove llama-cpp-python from cache
|
90
|
+
run: |
|
91
|
+
pip cache remove llama_cpp_python
|
92
|
+
|
93
|
+
- name: Cache huggingface datasets
|
94
|
+
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
|
95
|
+
with:
|
96
|
+
path: ~/.cache/huggingface
|
97
|
+
# Invalidate cache when any example notebook changes (may affect dataset downloads)
|
98
|
+
key: huggingface-${{ hashFiles('examples/**/*.ipynb') }}
|
99
|
+
|
100
|
+
- name: Install dependencies
|
101
|
+
run: |
|
102
|
+
python -m pip install --upgrade pip
|
103
|
+
python -m pip install tox tox-gh>=1.2
|
104
|
+
|
105
|
+
- name: Run integration tests with tox
|
106
|
+
env:
|
107
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
108
|
+
run: |
|
109
|
+
tox -e py3-integrationcov
|
110
|
+
|
111
|
+
- name: Remove llama-cpp-python from cache
|
112
|
+
if: always()
|
113
|
+
run: |
|
114
|
+
pip cache remove llama_cpp_python
|
115
|
+
|
116
|
+
- name: Upload integration test coverage to Codecov
|
117
|
+
uses: codecov/codecov-action@v4
|
118
|
+
with:
|
119
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
120
|
+
file: ./coverage-py3-integrationcov.xml
|
121
|
+
fail_ci_if_error: false
|
122
|
+
flags: integration
|
123
|
+
|
124
|
+
- name: Upload integration test artifacts
|
125
|
+
uses: actions/upload-artifact@v4
|
126
|
+
if: always()
|
127
|
+
with:
|
128
|
+
name: integration-test-results-${{ matrix.python }}-${{ matrix.platform }}
|
129
|
+
path: |
|
130
|
+
coverage-py3-integrationcov/
|
131
|
+
coverage-py3-integrationcov.xml
|
132
|
+
durations/py3-integrationcov.html
|
133
|
+
retention-days: 30
|
134
|
+
|
135
|
+
integration-test-workflow-complete:
|
136
|
+
needs: ["integration-test"]
|
137
|
+
runs-on: ubuntu-latest
|
138
|
+
steps:
|
139
|
+
- name: Integration Test Workflow Complete
|
140
|
+
run: echo "Integration Test Workflow Complete"
|
@@ -9,7 +9,8 @@ on:
|
|
9
9
|
- "main"
|
10
10
|
- "release-**"
|
11
11
|
paths:
|
12
|
-
- '
|
12
|
+
- 'src/**/*.py'
|
13
|
+
- 'tests/**/*.py'
|
13
14
|
- 'pyproject.toml'
|
14
15
|
- 'requirements*.txt'
|
15
16
|
- 'tox.ini'
|
@@ -19,7 +20,8 @@ on:
|
|
19
20
|
- "main"
|
20
21
|
- "release-**"
|
21
22
|
paths:
|
22
|
-
- '
|
23
|
+
- 'src/**/*.py'
|
24
|
+
- 'tests/**/*.py'
|
23
25
|
- 'pyproject.toml'
|
24
26
|
- 'requirements*.txt'
|
25
27
|
- 'tox.ini'
|
@@ -37,7 +39,7 @@ permissions:
|
|
37
39
|
|
38
40
|
jobs:
|
39
41
|
test:
|
40
|
-
name: "${{ matrix.python }} on ${{ matrix.platform }}"
|
42
|
+
name: "Unit Tests - ${{ matrix.python }} on ${{ matrix.platform }}"
|
41
43
|
runs-on: "${{ matrix.platform }}"
|
42
44
|
strategy:
|
43
45
|
matrix:
|
@@ -104,6 +106,7 @@ jobs:
|
|
104
106
|
run: |
|
105
107
|
tox -e py3-unitcov
|
106
108
|
|
109
|
+
|
107
110
|
- name: Remove llama-cpp-python from cache
|
108
111
|
if: always()
|
109
112
|
run: |
|
@@ -84,6 +84,11 @@ target/
|
|
84
84
|
# Jupyter Notebook
|
85
85
|
.ipynb_checkpoints
|
86
86
|
|
87
|
+
# Integration test artifacts
|
88
|
+
tests/integration/**/converted_scripts/
|
89
|
+
tests/integration/**/test_output/
|
90
|
+
tests/integration/**/output_data/
|
91
|
+
|
87
92
|
# IPython
|
88
93
|
profile_default/
|
89
94
|
ipython_config.py
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sdg_hub
|
3
|
-
Version: 0.4.0
|
3
|
+
Version: 0.4.1
|
4
4
|
Summary: Synthetic Data Generation
|
5
5
|
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
6
|
License: Apache-2.0
|
@@ -65,6 +65,7 @@ Requires-Dist: pytest-html; extra == "dev"
|
|
65
65
|
Requires-Dist: tox<5,>=4.4.2; extra == "dev"
|
66
66
|
Requires-Dist: ruff; extra == "dev"
|
67
67
|
Requires-Dist: pytest-env; extra == "dev"
|
68
|
+
Requires-Dist: nbconvert>=7.0.0; extra == "dev"
|
68
69
|
Dynamic: license-file
|
69
70
|
|
70
71
|
# `sdg_hub`: Synthetic Data Generation Toolkit
|
@@ -230,10 +230,244 @@ Constructs prompts from templates and data with validation and formatting suppor
|
|
230
230
|
|
231
231
|
## 🔍 TextParserBlock
|
232
232
|
|
233
|
-
Extracts structured data from LLM responses using
|
233
|
+
Extracts structured data from LLM responses using tag-based parsing or custom regex patterns. Essential for parsing LLM outputs into structured fields.
|
234
234
|
|
235
|
-
|
235
|
+
### Basic Tag-Based Parsing
|
236
236
|
|
237
|
+
Extract content between start and end tags:
|
238
|
+
|
239
|
+
```python
|
240
|
+
from sdg_hub.core.blocks import TextParserBlock
|
241
|
+
from datasets import Dataset
|
242
|
+
|
243
|
+
# Single field extraction
|
244
|
+
parser = TextParserBlock(
|
245
|
+
block_name="extract_answer",
|
246
|
+
input_cols=["llm_response"],
|
247
|
+
output_cols=["answer"],
|
248
|
+
start_tags=["<answer>"],
|
249
|
+
end_tags=["</answer>"]
|
250
|
+
)
|
251
|
+
|
252
|
+
dataset = Dataset.from_dict({
|
253
|
+
"llm_response": [
|
254
|
+
"Question analysis: ...\n<answer>Machine learning is a subset of AI.</answer>",
|
255
|
+
"Let me think...\n<answer>Neural networks process data in layers.</answer>"
|
256
|
+
]
|
257
|
+
})
|
258
|
+
|
259
|
+
result = parser.generate(dataset)
|
260
|
+
print(result["answer"])
|
261
|
+
# ['Machine learning is a subset of AI.', 'Neural networks process data in layers.']
|
262
|
+
```
|
263
|
+
|
264
|
+
### Multiple Field Extraction
|
265
|
+
|
266
|
+
Extract multiple structured fields from a single response:
|
267
|
+
|
268
|
+
```python
|
269
|
+
# Extract multiple fields with tag pairs
|
270
|
+
parser = TextParserBlock(
|
271
|
+
block_name="extract_qa",
|
272
|
+
input_cols=["llm_response"],
|
273
|
+
output_cols=["question", "answer", "confidence"],
|
274
|
+
start_tags=["<question>", "<answer>", "<confidence>"],
|
275
|
+
end_tags=["</question>", "</answer>", "</confidence>"]
|
276
|
+
)
|
277
|
+
|
278
|
+
dataset = Dataset.from_dict({
|
279
|
+
"llm_response": [
|
280
|
+
"""
|
281
|
+
<question>What is Python?</question>
|
282
|
+
<answer>Python is a high-level programming language.</answer>
|
283
|
+
<confidence>0.95</confidence>
|
284
|
+
"""
|
285
|
+
]
|
286
|
+
})
|
287
|
+
|
288
|
+
result = parser.generate(dataset)
|
289
|
+
print(result["question"]) # ['What is Python?']
|
290
|
+
print(result["answer"]) # ['Python is a high-level programming language.']
|
291
|
+
print(result["confidence"]) # ['0.95']
|
292
|
+
```
|
293
|
+
|
294
|
+
### Custom Regex Parsing
|
295
|
+
|
296
|
+
Use regex patterns for flexible extraction:
|
297
|
+
|
298
|
+
```python
|
299
|
+
# Extract using regex pattern
|
300
|
+
parser = TextParserBlock(
|
301
|
+
block_name="regex_parser",
|
302
|
+
input_cols=["llm_response"],
|
303
|
+
output_cols=["answer"],
|
304
|
+
parsing_pattern=r"Answer:\s*(.+?)(?:\n|$)"
|
305
|
+
)
|
306
|
+
|
307
|
+
dataset = Dataset.from_dict({
|
308
|
+
"llm_response": [
|
309
|
+
"Question: What is AI?\nAnswer: Artificial Intelligence is...\n",
|
310
|
+
"Let me answer:\nAnswer: Machine learning enables..."
|
311
|
+
]
|
312
|
+
})
|
313
|
+
|
314
|
+
result = parser.generate(dataset)
|
315
|
+
print(result["answer"])
|
316
|
+
# ['Artificial Intelligence is...', 'Machine learning enables...']
|
317
|
+
```
|
318
|
+
|
319
|
+
### Tag Cleanup
|
320
|
+
|
321
|
+
Remove unwanted tags from extracted content:
|
322
|
+
|
323
|
+
```python
|
324
|
+
# Clean up markdown and code tags
|
325
|
+
parser = TextParserBlock(
|
326
|
+
block_name="clean_parser",
|
327
|
+
input_cols=["llm_response"],
|
328
|
+
output_cols=["clean_answer"],
|
329
|
+
start_tags=["<answer>"],
|
330
|
+
end_tags=["</answer>"],
|
331
|
+
parser_cleanup_tags=["```", "###", "**"]
|
332
|
+
)
|
333
|
+
|
334
|
+
dataset = Dataset.from_dict({
|
335
|
+
"llm_response": [
|
336
|
+
"<answer>Here's the code: ```python\nprint('hello')```</answer>",
|
337
|
+
"<answer>**Important**: This is the ### answer</answer>"
|
338
|
+
]
|
339
|
+
})
|
340
|
+
|
341
|
+
result = parser.generate(dataset)
|
342
|
+
print(result["clean_answer"])
|
343
|
+
# ['Here\'s the code: python\nprint(\'hello\')', 'Important: This is the answer']
|
344
|
+
```
|
345
|
+
|
346
|
+
### Handling Multiple Matches
|
347
|
+
|
348
|
+
Extract all occurrences of a pattern:
|
349
|
+
|
350
|
+
```python
|
351
|
+
parser = TextParserBlock(
|
352
|
+
block_name="multi_extract",
|
353
|
+
input_cols=["llm_response"],
|
354
|
+
output_cols=["keywords"],
|
355
|
+
start_tags=["[KEY]"],
|
356
|
+
end_tags=["[/KEY]"]
|
357
|
+
)
|
358
|
+
|
359
|
+
dataset = Dataset.from_dict({
|
360
|
+
"llm_response": [
|
361
|
+
"Important terms: [KEY]machine learning[/KEY], [KEY]neural networks[/KEY], [KEY]deep learning[/KEY]"
|
362
|
+
]
|
363
|
+
})
|
364
|
+
|
365
|
+
result = parser.generate(dataset)
|
366
|
+
print(result["keywords"])
|
367
|
+
# [['machine learning', 'neural networks', 'deep learning']]
|
368
|
+
```
|
369
|
+
|
370
|
+
### Practical Example: Evaluation Response Parsing
|
371
|
+
|
372
|
+
Common pattern for parsing LLM evaluation responses:
|
373
|
+
|
374
|
+
```python
|
375
|
+
# Parse structured evaluation output
|
376
|
+
evaluation_parser = TextParserBlock(
|
377
|
+
block_name="parse_evaluation",
|
378
|
+
input_cols=["evaluation_response"],
|
379
|
+
output_cols=["explanation", "judgment"],
|
380
|
+
start_tags=["[Start of Explanation]", "[Start of Answer]"],
|
381
|
+
end_tags=["[End of Explanation]", "[End of Answer]"],
|
382
|
+
parser_cleanup_tags=["```", "###"]
|
383
|
+
)
|
384
|
+
|
385
|
+
dataset = Dataset.from_dict({
|
386
|
+
"evaluation_response": [
|
387
|
+
"""
|
388
|
+
[Start of Explanation]
|
389
|
+
The response accurately reflects the information in the document.
|
390
|
+
No hallucinations or contradictions were found.
|
391
|
+
[End of Explanation]
|
392
|
+
|
393
|
+
[Start of Answer]
|
394
|
+
YES
|
395
|
+
[End of Answer]
|
396
|
+
"""
|
397
|
+
]
|
398
|
+
})
|
399
|
+
|
400
|
+
result = evaluation_parser.generate(dataset)
|
401
|
+
print(result["explanation"]) # ['The response accurately reflects...']
|
402
|
+
print(result["judgment"]) # ['YES']
|
403
|
+
```
|
404
|
+
|
405
|
+
### Integration with LLMChatBlock
|
406
|
+
|
407
|
+
TextParserBlock is commonly used after LLMChatBlock to structure responses:
|
408
|
+
|
409
|
+
```python
|
410
|
+
from sdg_hub.core.blocks import LLMChatBlock, LLMParserBlock, TextParserBlock
|
411
|
+
|
412
|
+
# Step 1: Generate LLM response
|
413
|
+
chat_block = LLMChatBlock(
|
414
|
+
block_name="evaluator",
|
415
|
+
model="openai/gpt-4o",
|
416
|
+
input_cols=["messages"],
|
417
|
+
output_cols=["eval_response"]
|
418
|
+
)
|
419
|
+
|
420
|
+
# Step 2: Extract content from response object
|
421
|
+
# Set field_prefix explicitly to get cleaner column names
|
422
|
+
llm_parser = LLMParserBlock(
|
423
|
+
block_name="extract_eval",
|
424
|
+
input_cols=["eval_response"],
|
425
|
+
extract_content=True,
|
426
|
+
field_prefix="eval_"  # Results in "eval_content" instead of "extract_eval_content"
|
427
|
+
)
|
428
|
+
|
429
|
+
# Step 3: Parse structured fields from text
|
430
|
+
text_parser = TextParserBlock(
|
431
|
+
block_name="parse_fields",
|
432
|
+
input_cols=["eval_content"],
|
433
|
+
output_cols=["score", "reasoning"],
|
434
|
+
start_tags=["[SCORE]", "[REASONING]"],
|
435
|
+
end_tags=["[/SCORE]", "[/REASONING]"]
|
436
|
+
)
|
437
|
+
|
438
|
+
# Execute in sequence (or use a Flow)
|
439
|
+
dataset = Dataset.from_dict({
|
440
|
+
"messages": [[{"role": "user", "content": "Evaluate this text..."}]]
|
441
|
+
})
|
442
|
+
|
443
|
+
result = chat_block.generate(dataset)
|
444
|
+
result = llm_parser.generate(result)
|
445
|
+
result = text_parser.generate(result)
|
446
|
+
|
447
|
+
print(result["score"]) # Extracted score
|
448
|
+
print(result["reasoning"]) # Extracted reasoning
|
449
|
+
```
|
450
|
+
|
451
|
+
### Configuration Reference
|
452
|
+
|
453
|
+
**Required Parameters:**
|
454
|
+
- `block_name` - Unique identifier for the block
|
455
|
+
- `input_cols` - Single column containing text to parse
|
456
|
+
- `output_cols` - List of field names for extracted content
|
457
|
+
|
458
|
+
**Parsing Methods (choose one):**
|
459
|
+
- **Tag-based**: `start_tags` + `end_tags` (must have same length as `output_cols`)
|
460
|
+
- **Regex**: `parsing_pattern` (single regex with capture groups)
|
461
|
+
|
462
|
+
**Optional Parameters:**
|
463
|
+
- `parser_cleanup_tags` - List of tags to remove from extracted text
|
464
|
+
- `expand_lists` - Whether to expand list inputs into rows (default: `True`)
|
465
|
+
|
466
|
+
**Tag Parsing Rules:**
|
467
|
+
- Number of tag pairs must match number of output columns
|
468
|
+
- Each tag pair extracts all matches for that field
|
469
|
+
- Tags can be any string (XML-style, markdown-style, custom)
|
470
|
+
- Missing tags result in empty lists for that field
|
237
471
|
|
238
472
|
## 🚀 Next Steps
|
239
473
|
|
@@ -292,34 +292,135 @@ print(f"Output columns: {dry_result['final_dataset']['columns']}")
|
|
292
292
|
print(f"Sample output: {dry_result['sample_output']}")
|
293
293
|
```
|
294
294
|
|
295
|
-
###
|
295
|
+
### Runtime Parameters
|
296
296
|
|
297
|
-
|
297
|
+
Runtime parameters allow you to customize block behavior at execution time without modifying flow YAML files. You can override global parameters for all blocks or configure specific blocks individually.
|
298
|
+
|
299
|
+
**Global Parameter Override:**
|
300
|
+
|
301
|
+
Apply parameters to all compatible blocks in the flow:
|
298
302
|
|
299
303
|
```python
|
300
|
-
# Override
|
304
|
+
# Override global parameters
|
301
305
|
result = flow.generate(
|
302
306
|
dataset,
|
303
307
|
runtime_params={
|
308
|
+
"temperature": 0.7,
|
304
309
|
"max_tokens": 200,
|
305
|
-
"
|
310
|
+
"top_p": 0.95
|
306
311
|
}
|
307
312
|
)
|
308
313
|
```
|
309
314
|
|
310
|
-
|
315
|
+
**Block-Specific Configuration:**
|
316
|
+
|
317
|
+
Target individual blocks by their `block_name` for fine-grained control:
|
311
318
|
|
312
|
-
|
319
|
+
```python
|
320
|
+
# Configure different parameters for each block
|
321
|
+
result = flow.generate(
|
322
|
+
dataset,
|
323
|
+
runtime_params={
|
324
|
+
# LLM blocks - control generation parameters
|
325
|
+
"question_generator": {
|
326
|
+
"temperature": 0.9,
|
327
|
+
"max_tokens": 100,
|
328
|
+
"top_p": 0.95,
|
329
|
+
"frequency_penalty": 0.5
|
330
|
+
},
|
331
|
+
"answer_generator": {
|
332
|
+
"temperature": 0.5,
|
333
|
+
"max_tokens": 300,
|
334
|
+
"presence_penalty": 0.3
|
335
|
+
},
|
336
|
+
|
337
|
+
# LLM parser blocks - configure extraction
|
338
|
+
"extract_eval_content": {
|
339
|
+
"extract_content": True,
|
340
|
+
"extract_reasoning_content": True,
|
341
|
+
"field_prefix": "llm_"
|
342
|
+
},
|
343
|
+
|
344
|
+
# Text parsing blocks - override parsing tags
|
345
|
+
"parse_evaluation": {
|
346
|
+
"start_tags": ["[Answer]", "[Explanation]", "[Score]"],
|
347
|
+
"end_tags": ["[/Answer]", "[/Explanation]", "[/Score]"],
|
348
|
+
"parser_cleanup_tags": ["```", "###", "---"]
|
349
|
+
},
|
350
|
+
|
351
|
+
# Filter blocks - adjust filter criteria
|
352
|
+
"quality_filter": {
|
353
|
+
"filter_value": 0.9,
|
354
|
+
"operation": "ge"
|
355
|
+
},
|
356
|
+
"faithfulness_filter": {
|
357
|
+
"filter_value": "YES",
|
358
|
+
"operation": "eq"
|
359
|
+
}
|
360
|
+
}
|
361
|
+
)
|
362
|
+
```
|
313
363
|
|
314
|
-
|
364
|
+
**Common Runtime Parameters by Block Type:**
|
365
|
+
|
366
|
+
| Block Type | Parameter | Description | Example Values |
|
367
|
+
|------------|-----------|-------------|----------------|
|
368
|
+
| **LLMChatBlock** | `temperature` | Control randomness in generation | `0.0` - `2.0` |
|
369
|
+
| | `max_tokens` | Maximum response length | `50`, `200`, `1000` |
|
370
|
+
| | `top_p` | Nucleus sampling threshold | `0.0` - `1.0` |
|
371
|
+
| | `frequency_penalty` | Penalize token repetition | `-2.0` - `2.0` |
|
372
|
+
| | `presence_penalty` | Penalize tokens that have already appeared (encourages new topics) | `-2.0` - `2.0` |
|
373
|
+
| **LLMParserBlock** | `extract_content` | Extract main content field | `True`, `False` |
|
374
|
+
| | `extract_reasoning_content` | Extract reasoning/thinking | `True`, `False` |
|
375
|
+
| | `extract_tool_calls` | Extract tool call data | `True`, `False` |
|
376
|
+
| | `field_prefix` | Prefix for output fields | `"llm_"`, `"parsed_"` |
|
377
|
+
| **TextParserBlock** | `start_tags` | Opening tags for extraction | `["<answer>", "[Q]"]` |
|
378
|
+
| | `end_tags` | Closing tags for extraction | `["</answer>", "[/Q]"]` |
|
379
|
+
| | `parsing_pattern` | Custom regex pattern | `r"Answer:\s*(.+)"` |
|
380
|
+
| | `parser_cleanup_tags` | Tags to remove from output | `["```", "###"]` |
|
381
|
+
| **ColumnValueFilterBlock** | `filter_value` | Value to filter by | `0.8`, `"YES"`, `[1, 2]` |
|
382
|
+
| | `operation` | Comparison operation | `"eq"`, `"gt"`, `"contains"` |
|
383
|
+
| | `convert_dtype` | Type conversion | `"float"`, `"int"` |
|
384
|
+
|
385
|
+
**Practical Examples:**
|
315
386
|
|
316
387
|
```python
|
317
|
-
#
|
388
|
+
# Experiment with different generation styles
|
389
|
+
result = flow.generate(
|
390
|
+
dataset,
|
391
|
+
runtime_params={
|
392
|
+
"temperature": 0.9, # More creative
|
393
|
+
"top_p": 0.95
|
394
|
+
}
|
395
|
+
)
|
396
|
+
|
397
|
+
# Adjust parsing for different prompt formats
|
398
|
+
result = flow.generate(
|
399
|
+
dataset,
|
400
|
+
runtime_params={
|
401
|
+
"text_parser": {
|
402
|
+
"start_tags": ["<thinking>", "<answer>"],
|
403
|
+
"end_tags": ["</thinking>", "</answer>"]
|
404
|
+
}
|
405
|
+
}
|
406
|
+
)
|
407
|
+
|
408
|
+
# Increase quality thresholds for production
|
409
|
+
result = flow.generate(
|
410
|
+
dataset,
|
411
|
+
runtime_params={
|
412
|
+
"quality_filter": {"filter_value": 0.95},
|
413
|
+
"relevancy_filter": {"filter_value": 0.90}
|
414
|
+
}
|
415
|
+
)
|
416
|
+
|
417
|
+
# Mix global and block-specific parameters
|
318
418
|
result = flow.generate(
|
319
|
-
dataset,
|
320
|
-
runtime_params
|
321
|
-
|
322
|
-
|
419
|
+
dataset,
|
420
|
+
runtime_params={
|
421
|
+
"temperature": 0.7, # Global default
|
422
|
+
"creative_generator": {"temperature": 1.0}, # Override for one block
|
423
|
+
"quality_filter": {"filter_value": 0.85}
|
323
424
|
}
|
324
425
|
)
|
325
426
|
```
|
@@ -369,6 +470,73 @@ result = flow.generate(dataset, max_concurrency=20)
|
|
369
470
|
result = flow.generate(dataset) # Default behavior
|
370
471
|
```
|
371
472
|
|
473
|
+
### Checkpointing
|
474
|
+
|
475
|
+
Flow checkpointing enables resuming interrupted executions by saving progress periodically. This is essential for long-running flows that process large datasets, preventing data loss from failures or interruptions.
|
476
|
+
|
477
|
+
**Basic Checkpointing:**
|
478
|
+
|
479
|
+
```python
|
480
|
+
# Enable checkpointing with automatic resume
|
481
|
+
result = flow.generate(
|
482
|
+
dataset,
|
483
|
+
checkpoint_dir="./my_flow_checkpoints",
|
484
|
+
save_freq=100 # Save every 100 completed samples
|
485
|
+
)
|
486
|
+
```
|
487
|
+
|
488
|
+
**How It Works:**
|
489
|
+
|
490
|
+
1. **Progress Tracking** - Flow saves completed samples to checkpoint files after every `save_freq` samples
|
491
|
+
2. **Automatic Resume** - On restart, Flow detects existing checkpoints and processes only remaining samples
|
492
|
+
3. **Final Merge** - Completed and newly processed samples are automatically combined in the final result
|
493
|
+
|
494
|
+
**Use Cases:**
|
495
|
+
|
496
|
+
- **Long-Running Flows** - Process thousands of samples safely over hours or days
|
497
|
+
- **Unreliable Infrastructure** - Protect against network failures, rate limits, or system crashes
|
498
|
+
- **Iterative Development** - Test and refine flows without reprocessing completed samples
|
499
|
+
- **Cost Management** - Avoid wasting API credits by resuming from the last checkpoint after a failure instead of reprocessing completed samples
|
500
|
+
|
501
|
+
**Configuration Options:**
|
502
|
+
|
503
|
+
```python
|
504
|
+
# Save checkpoints every N samples (recommended for large datasets)
|
505
|
+
result = flow.generate(
|
506
|
+
dataset,
|
507
|
+
checkpoint_dir="./checkpoints",
|
508
|
+
save_freq=50  # Checkpoint after every 50 samples
|
509
|
+
)
|
510
|
+
|
511
|
+
# Only save final result (minimal overhead)
|
512
|
+
result = flow.generate(
|
513
|
+
dataset,
|
514
|
+
checkpoint_dir="./checkpoints"
|
515
|
+
# No save_freq - only saves at completion
|
516
|
+
)
|
517
|
+
|
518
|
+
# Combine with other execution features
|
519
|
+
result = flow.generate(
|
520
|
+
dataset,
|
521
|
+
checkpoint_dir="./checkpoints",
|
522
|
+
save_freq=100,
|
523
|
+
max_concurrency=10
|
524
|
+
)
|
525
|
+
```
|
526
|
+
|
527
|
+
**Checkpoint Structure:**
|
528
|
+
|
529
|
+
Checkpoint directories contain:
|
530
|
+
- `checkpoint_NNNN.jsonl` - Completed sample batches in JSONL format
|
531
|
+
- `flow_metadata.json` - Flow ID, progress counters, and validation data
|
532
|
+
|
533
|
+
**Important Notes:**
|
534
|
+
|
535
|
+
- Checkpoints are flow-specific using `flow_id` to prevent mixing incompatible data
|
536
|
+
- Remaining samples are identified by comparing the input dataset with the completed samples on the columns they have in common
|
537
|
+
- If all samples are completed, Flow skips processing and returns merged results immediately
|
538
|
+
- Clean up checkpoint directories manually when no longer needed
|
539
|
+
|
372
540
|
## 🚀 Next Steps
|
373
541
|
|
374
542
|
Ready to master the flow system? Explore these detailed guides:
|