sdg-hub 0.7.0__tar.gz → 0.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/docs.yml +1 -1
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/integration-test.yml +3 -2
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/pypi.yaml +3 -3
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/PKG-INFO +15 -14
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +26 -17
- sdg_hub-0.7.2/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/imgs/quality_benchmark_accuracy.png +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/knowledge_utils.py +12 -6
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/pyproject.toml +23 -16
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/_version.py +3 -3
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +9 -5
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/base.py +6 -1
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/PKG-INFO +15 -14
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/SOURCES.txt +1 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/requires.txt +3 -1
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_base.py +50 -4
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tox.ini +6 -4
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/actionlint.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/actions/free-disk-space/action.yml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/dependabot.yml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/mergify.yml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/actionlint.dockerfile +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/actionlint.yml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/lint.yml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/matchers/actionlint.json +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/matchers/pylint.json +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/packer.yml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.github/workflows/test.yml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.gitignore +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.isort.cfg +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.markdownlint-cli2.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.pre-commit-config.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/.pylintrc +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/CLAUDE.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/CONTRIBUTING.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/LICENSE +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/Makefile +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/README.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/.nojekyll +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/README.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/_coverpage.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/_navbar.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/_sidebar.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/api-reference.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/assets/logo.png +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/assets/sdg-hub-cover.png +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/custom-blocks.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/filtering-blocks.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/llm-blocks.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/overview.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/blocks/transform-blocks.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/concepts.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/development.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/flows/available-flows.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/flows/custom-flows.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/flows/discovery.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/flows/overview.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/index.html +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/installation.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/docs/quick-start.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/README.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/knowledge_generation_ja.ipynb +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/rag_evaluation/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/rag_evaluation/rag_evaluation_dataset_generation.ipynb +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/text_analysis/README.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/scripts/packer/centos.pkr.hcl +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/scripts/packer/setup-centos.sh +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/scripts/ruff.sh +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/scripts/snyk_notebook_scan.sh +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/setup.cfg +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/base.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/llm/text_parser_block.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/registry.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/rename_columns.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/checkpointer.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/metadata.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/registry.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/flow/validation.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/datautils.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/error_handling.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/flow_metrics.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/logger_config.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/path_resolution.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/time_estimator.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/answer_generation.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/conceptual_qa_generation.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/context_extraction.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/flow.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/groundedness_critic.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/question_evolution.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/evaluation/rag/topic_generation.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub/py.typed +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/src/sdg_hub.egg-info/top_level.txt +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/llm/test_llm_chat_block.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/llm/test_llm_parser_block.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/llm/test_textparserblock.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/test_base_block.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/test_registry.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_config.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_index_based_mapper.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_json_structure_block.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_melt_columns.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_rename_columns.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_text_concat.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/blocks/transform/test_uniform_col_val_setter.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/conftest.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_checkpointer.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_dataset_requirements.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_integration.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_metadata.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_registry.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_time_estimation.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/flow/test_validation.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/README.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/utils/test_datautils.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/utils/test_error_handling.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/utils/test_flow_metrics.py +0 -0
- {sdg_hub-0.7.0 → sdg_hub-0.7.2}/tests/utils/test_path_resolution.py +0 -0
|
@@ -39,6 +39,6 @@ jobs:
|
|
|
39
39
|
- name: "Checkout"
|
|
40
40
|
uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
|
|
41
41
|
- name: "Check Markdown documents"
|
|
42
|
-
uses: DavidAnson/markdownlint-cli2-action@
|
|
42
|
+
uses: DavidAnson/markdownlint-cli2-action@07035fd053f7be764496c0f8d8f9f41f98305101 # v22.0.0
|
|
43
43
|
with:
|
|
44
44
|
globs: '**/*.md'
|
|
@@ -112,7 +112,7 @@ jobs:
|
|
|
112
112
|
|
|
113
113
|
|
|
114
114
|
- name: Cache huggingface datasets
|
|
115
|
-
uses: actions/cache@
|
|
115
|
+
uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
|
|
116
116
|
with:
|
|
117
117
|
path: ~/.cache/huggingface
|
|
118
118
|
# Invalidate cache when any example notebook changes (may affect dataset downloads)
|
|
@@ -127,6 +127,7 @@ jobs:
|
|
|
127
127
|
env:
|
|
128
128
|
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
129
129
|
run: |
|
|
130
|
+
# Uses .[dev,integration] - lightweight, no torch/transformers
|
|
130
131
|
tox -e py3-integrationcov
|
|
131
132
|
|
|
132
133
|
|
|
@@ -139,7 +140,7 @@ jobs:
|
|
|
139
140
|
flags: integration
|
|
140
141
|
|
|
141
142
|
- name: Upload integration test artifacts
|
|
142
|
-
uses: actions/upload-artifact@
|
|
143
|
+
uses: actions/upload-artifact@v6
|
|
143
144
|
if: always()
|
|
144
145
|
with:
|
|
145
146
|
name: integration-test-results-${{ matrix.python }}-${{ matrix.platform }}
|
|
@@ -72,7 +72,7 @@ jobs:
|
|
|
72
72
|
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
|
|
73
73
|
|
|
74
74
|
- name: "Download build artifacts"
|
|
75
|
-
uses: actions/download-artifact@
|
|
75
|
+
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
|
|
76
76
|
with:
|
|
77
77
|
name: Packages
|
|
78
78
|
path: dist
|
|
@@ -104,13 +104,13 @@ jobs:
|
|
|
104
104
|
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
|
|
105
105
|
|
|
106
106
|
- name: "Download build artifacts"
|
|
107
|
-
uses: actions/download-artifact@
|
|
107
|
+
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
|
|
108
108
|
with:
|
|
109
109
|
name: Packages
|
|
110
110
|
path: dist
|
|
111
111
|
|
|
112
112
|
- name: "Sigstore sign package"
|
|
113
|
-
uses: sigstore/gh-action-sigstore-python@
|
|
113
|
+
uses: sigstore/gh-action-sigstore-python@a5caf349bc536fbef3668a10ed7f5cd309a4b53d # v3.2.0
|
|
114
114
|
with:
|
|
115
115
|
inputs: |
|
|
116
116
|
./dist/*.tar.gz
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sdg_hub
|
|
3
|
-
Version: 0.7.
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: Synthetic Data Generation
|
|
5
5
|
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -33,6 +33,20 @@ Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
|
33
33
|
Requires-Dist: python-dotenv<2.0.0,>=1.0.0
|
|
34
34
|
Requires-Dist: tenacity!=8.4.0,>=8.3.0
|
|
35
35
|
Requires-Dist: tqdm<5.0.0,>=4.66.2
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
|
|
38
|
+
Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
|
|
39
|
+
Requires-Dist: pylint-pydantic; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
42
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
43
|
+
Requires-Dist: pytest-html; extra == "dev"
|
|
44
|
+
Requires-Dist: tox<5,>=4.4.2; extra == "dev"
|
|
45
|
+
Requires-Dist: ruff; extra == "dev"
|
|
46
|
+
Requires-Dist: pytest-env; extra == "dev"
|
|
47
|
+
Requires-Dist: nbconvert>=7.0.0; extra == "dev"
|
|
48
|
+
Provides-Extra: integration
|
|
49
|
+
Requires-Dist: nest-asyncio; extra == "integration"
|
|
36
50
|
Provides-Extra: examples
|
|
37
51
|
Requires-Dist: tabulate>=0.9.0; extra == "examples"
|
|
38
52
|
Requires-Dist: transformers>=4.37.0; extra == "examples"
|
|
@@ -46,20 +60,7 @@ Requires-Dist: nltk; extra == "examples"
|
|
|
46
60
|
Requires-Dist: sentence-transformers; extra == "examples"
|
|
47
61
|
Requires-Dist: instructor; extra == "examples"
|
|
48
62
|
Requires-Dist: fastapi; extra == "examples"
|
|
49
|
-
Requires-Dist: nest-asyncio; extra == "examples"
|
|
50
63
|
Requires-Dist: ipykernel; extra == "examples"
|
|
51
|
-
Provides-Extra: dev
|
|
52
|
-
Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
|
|
53
|
-
Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
|
|
54
|
-
Requires-Dist: pylint-pydantic; extra == "dev"
|
|
55
|
-
Requires-Dist: pytest; extra == "dev"
|
|
56
|
-
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
57
|
-
Requires-Dist: pytest-cov; extra == "dev"
|
|
58
|
-
Requires-Dist: pytest-html; extra == "dev"
|
|
59
|
-
Requires-Dist: tox<5,>=4.4.2; extra == "dev"
|
|
60
|
-
Requires-Dist: ruff; extra == "dev"
|
|
61
|
-
Requires-Dist: pytest-env; extra == "dev"
|
|
62
|
-
Requires-Dist: nbconvert>=7.0.0; extra == "dev"
|
|
63
64
|
Dynamic: license-file
|
|
64
65
|
|
|
65
66
|
# `sdg_hub`: Synthetic Data Generation Toolkit
|
|
@@ -48,29 +48,38 @@ Only claims passing this check are retained. This process filters out **hallucin
|
|
|
48
48
|
|
|
49
49
|
---
|
|
50
50
|
|
|
51
|
-
## Data Generation Statistics
|
|
51
|
+
## Data Generation Statistics and Results
|
|
52
|
+
|
|
53
|
+
**Teacher model for generation:** `openai/gpt-oss-120b`
|
|
54
|
+
**Student model trained:** `meta-llama/Llama-3.1-8B-Instruct`
|
|
55
|
+
**Training method:** Supervised Fine-Tuning (SFT)
|
|
56
|
+
|
|
57
|
+
---
|
|
52
58
|
|
|
53
59
|
### Summary Augmentation
|
|
54
60
|
|
|
55
|
-
Each “cut” represents the total number of
|
|
61
|
+
For each document, we generate three augmentation types—detailed summaries, extractive summaries, and atomic facts. Each “cut” on the table below represents the total number of summary augmentations per document (i.e., how many times each augmentation process is run).
|
|
56
62
|
|
|
57
|
-
| Cut (NUMBER\_OF\_SUMMARIES = 3) | Token Count
|
|
58
|
-
| ------------------------------- |
|
|
59
|
-
|
|
|
60
|
-
|
|
|
61
|
-
|
|
|
62
|
-
|
|
|
63
|
-
|
|
|
64
|
-
|
|
|
65
|
-
| 40 | 87,118,308 |
|
|
66
|
-
| 50 | 108,779,213 |
|
|
63
|
+
| Cut (NUMBER\_OF\_SUMMARIES = 3) | Token Count |
|
|
64
|
+
| ------------------------------- | ------------- |
|
|
65
|
+
| Input Corpus | 1,517,465 |
|
|
66
|
+
| 10 | 87,248,889 |
|
|
67
|
+
| 20 | 158,615,276 |
|
|
68
|
+
| 30 | 230,306,195 |
|
|
69
|
+
| 40 | 301,805,906 |
|
|
70
|
+
| 50 | 373,183,414 |
|
|
67
71
|
|
|
68
72
|
---
|
|
69
73
|
|
|
70
|
-
###
|
|
74
|
+
### Benchmark Results
|
|
71
75
|
|
|
72
|
-
|
|
76
|
+
- **Evaluation benchmark:** [QuALITY benchmark](https://nyu-mll.github.io/quality/)
|
|
77
|
+
- **Evaluation script & metric:** [Synthetic_Continued_Pretraining](https://github.com/ZitongYang/Synthetic_Continued_Pretraining/blob/main/evaluation.py), Exact Match (EM)
|
|
78
|
+
- **Student model:** meta-llama/Llama-3.1-8B-Instruct (after SFT on generated/augmented summaries)
|
|
79
|
+
- **Performance metric:** Model accuracy
|
|
73
80
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
81
|
+

|
|
82
|
+
|
|
83
|
+
*Figure: Model accuracy across the QuALITY benchmark datasets, comparing SFT training on enhanced document summaries with the original model performance.*
|
|
84
|
+
|
|
85
|
+
---
|
|
Binary file
|
|
@@ -602,13 +602,14 @@ def _num_chars_from_tokens(num_tokens) -> int:
|
|
|
602
602
|
return int(num_tokens * 4) # 1 token ~ 4 English character
|
|
603
603
|
|
|
604
604
|
|
|
605
|
-
def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
|
|
605
|
+
def chunk_document(documents: List, server_ctx_size, chunk_word_count, **kwargs) -> List[str]:
|
|
606
606
|
"""
|
|
607
607
|
Iterates over the documents and splits them into chunks based on the word count provided by the user.
|
|
608
608
|
Args:
|
|
609
609
|
documents (list): List of documents retrieved from git (can also consist of a single document).
|
|
610
610
|
server_ctx_size (int): Context window size of server.
|
|
611
611
|
chunk_word_count (int): Maximum number of words to chunk a document.
|
|
612
|
+
chunk_overlap (int): Overlap in characters between chunks.
|
|
612
613
|
Returns:
|
|
613
614
|
List[str]: List of chunked documents.
|
|
614
615
|
"""
|
|
@@ -634,7 +635,7 @@ def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[s
|
|
|
634
635
|
# Placeholder for params
|
|
635
636
|
content = []
|
|
636
637
|
chunk_size = _num_chars_from_tokens(no_tokens_per_doc)
|
|
637
|
-
chunk_overlap = _DEFAULT_CHUNK_OVERLAP
|
|
638
|
+
chunk_overlap = int(kwargs.pop("chunk_overlap", str(_DEFAULT_CHUNK_OVERLAP)))
|
|
638
639
|
|
|
639
640
|
# Using Markdown as default, document-specific chunking will be implemented in seperate pr.
|
|
640
641
|
text_splitter = RecursiveCharacterTextSplitter.from_language(
|
|
@@ -729,16 +730,21 @@ class DocProcessor:
|
|
|
729
730
|
}
|
|
730
731
|
)
|
|
731
732
|
|
|
732
|
-
def _add_icls(self, chunked_document: Dataset) -> Dataset:
|
|
733
|
+
def _add_icls(self, chunked_document: Dataset, **kwargs) -> Dataset:
|
|
733
734
|
"""
|
|
734
735
|
Add the ICLS label to the dataset.
|
|
735
736
|
Args:
|
|
736
737
|
dataset (Dataset): Dataset object.
|
|
738
|
+
server_ctx_size (int): Context window size of server.
|
|
739
|
+
chunk_word_count (int): Maximum number of words to chunk a document.
|
|
740
|
+
chunk_overlap (int): Overlap in characters between chunks.
|
|
737
741
|
|
|
738
742
|
Returns
|
|
739
743
|
-------
|
|
740
744
|
Dataset: Dataset object with ICLS label.
|
|
741
745
|
"""
|
|
746
|
+
server_ctx_size = int(kwargs.pop("server_ctx_size", "4096"))
|
|
747
|
+
chunk_word_count = int(kwargs.pop("chunk_word_count", "1024"))
|
|
742
748
|
icl = self.user_config["seed_examples"]
|
|
743
749
|
chunked_document_all_icl = []
|
|
744
750
|
for icl_ in icl:
|
|
@@ -762,7 +768,7 @@ class DocProcessor:
|
|
|
762
768
|
chunked_document_all_icl = chunked_document_all_icl.map(
|
|
763
769
|
lambda x: {
|
|
764
770
|
"chunks": chunk_document(
|
|
765
|
-
[x["document"]], server_ctx_size=
|
|
771
|
+
[x["document"]], server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, **kwargs
|
|
766
772
|
)
|
|
767
773
|
if get_token_count(x["document"], self.tokenizer) > 1024
|
|
768
774
|
else [x["document"]]
|
|
@@ -797,7 +803,7 @@ class DocProcessor:
|
|
|
797
803
|
df = safe_concatenate_datasets([ds.to_pandas() for ds in datasets])
|
|
798
804
|
return Dataset.from_pandas(df) if df is not None else None
|
|
799
805
|
|
|
800
|
-
def get_processed_markdown_dataset(self, list_md_files: list[Path]) -> Dataset:
|
|
806
|
+
def get_processed_markdown_dataset(self, list_md_files: list[Path], **kwargs) -> Dataset:
|
|
801
807
|
chunks_mds = []
|
|
802
808
|
for md_file in list_md_files:
|
|
803
809
|
with open(md_file, "r", encoding="utf-8") as f:
|
|
@@ -811,5 +817,5 @@ class DocProcessor:
|
|
|
811
817
|
}
|
|
812
818
|
)
|
|
813
819
|
chunk_ds = Dataset.from_list(chunks_mds)
|
|
814
|
-
chunk_ds_with_icls = self._add_icls(chunk_ds)
|
|
820
|
+
chunk_ds_with_icls = self._add_icls(chunk_ds, **kwargs)
|
|
815
821
|
return chunk_ds_with_icls
|
|
@@ -51,22 +51,7 @@ source = "https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub"
|
|
|
51
51
|
issues = "https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues"
|
|
52
52
|
|
|
53
53
|
[project.optional-dependencies]
|
|
54
|
-
|
|
55
|
-
"tabulate>=0.9.0",
|
|
56
|
-
"transformers>=4.37.0",
|
|
57
|
-
"langchain-text-splitters",
|
|
58
|
-
"docling>=2.3.0",
|
|
59
|
-
"scikit-learn",
|
|
60
|
-
"polars",
|
|
61
|
-
"matplotlib",
|
|
62
|
-
"spacy",
|
|
63
|
-
"nltk",
|
|
64
|
-
"sentence-transformers",
|
|
65
|
-
"instructor",
|
|
66
|
-
"fastapi",
|
|
67
|
-
"nest-asyncio",
|
|
68
|
-
"ipykernel",
|
|
69
|
-
]
|
|
54
|
+
# Development and testing dependencies (lightweight, no ML libraries)
|
|
70
55
|
dev = [
|
|
71
56
|
"pre-commit>=3.0.4,<4.0",
|
|
72
57
|
"pylint>=2.16.2,<4.0",
|
|
@@ -81,6 +66,28 @@ dev = [
|
|
|
81
66
|
# Integration testing dependencies
|
|
82
67
|
"nbconvert>=7.0.0",
|
|
83
68
|
]
|
|
69
|
+
# Minimal dependencies for integration testing only
|
|
70
|
+
# Integration tests run knowledge_generation.ipynb which only needs nest-asyncio
|
|
71
|
+
integration = [
|
|
72
|
+
"nest-asyncio",
|
|
73
|
+
]
|
|
74
|
+
# Heavy dependencies for example notebooks (knowledge_mixing.ipynb, etc.)
|
|
75
|
+
# NOT required for core functionality testing or integration tests
|
|
76
|
+
examples = [
|
|
77
|
+
"tabulate>=0.9.0",
|
|
78
|
+
"transformers>=4.37.0", # For knowledge_mixing.ipynb, NOT integration tests
|
|
79
|
+
"langchain-text-splitters",
|
|
80
|
+
"docling>=2.3.0", # For document parsing examples
|
|
81
|
+
"scikit-learn", # For raft_builder.py utility
|
|
82
|
+
"polars", # For knowledge_mixing_utils.py
|
|
83
|
+
"matplotlib",
|
|
84
|
+
"spacy",
|
|
85
|
+
"nltk",
|
|
86
|
+
"sentence-transformers",
|
|
87
|
+
"instructor",
|
|
88
|
+
"fastapi",
|
|
89
|
+
"ipykernel",
|
|
90
|
+
]
|
|
84
91
|
|
|
85
92
|
[tool.setuptools_scm]
|
|
86
93
|
version_file = "src/sdg_hub/_version.py"
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.7.0'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 7, 0)
|
|
31
|
+
__version__ = version = '0.7.2'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 7, 2)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g99a40a268'
|
|
@@ -6,7 +6,8 @@ from typing import Any, Optional
|
|
|
6
6
|
import asyncio
|
|
7
7
|
|
|
8
8
|
from litellm import acompletion, completion
|
|
9
|
-
from pydantic import ConfigDict, Field, field_validator
|
|
9
|
+
from pydantic import ConfigDict, Field, SecretStr, field_validator
|
|
10
|
+
from tqdm.asyncio import tqdm_asyncio
|
|
10
11
|
import litellm
|
|
11
12
|
|
|
12
13
|
# Third Party
|
|
@@ -52,8 +53,9 @@ class LLMChatBlock(BaseBlock):
|
|
|
52
53
|
model : Optional[str], optional
|
|
53
54
|
Model identifier in LiteLLM format. Can be set later via flow.set_model_config().
|
|
54
55
|
Examples: "openai/gpt-4", "anthropic/claude-3-sonnet-20240229"
|
|
55
|
-
api_key : Optional[str], optional
|
|
56
|
+
api_key : Optional[SecretStr], optional
|
|
56
57
|
API key for the provider. Falls back to environment variables.
|
|
58
|
+
Automatically redacted in logs and string representations.
|
|
57
59
|
api_base : Optional[str], optional
|
|
58
60
|
Base URL for the API. Required for local models.
|
|
59
61
|
async_mode : bool, optional
|
|
@@ -97,7 +99,7 @@ class LLMChatBlock(BaseBlock):
|
|
|
97
99
|
model: Optional[str] = Field(
|
|
98
100
|
None, exclude=True, description="Model identifier in LiteLLM format"
|
|
99
101
|
)
|
|
100
|
-
api_key: Optional[str] = Field(
|
|
102
|
+
api_key: Optional[SecretStr] = Field(
|
|
101
103
|
None, exclude=True, description="API key for the provider"
|
|
102
104
|
)
|
|
103
105
|
api_base: Optional[str] = Field(
|
|
@@ -301,7 +303,7 @@ class LLMChatBlock(BaseBlock):
|
|
|
301
303
|
if self.model is not None:
|
|
302
304
|
completion_kwargs["model"] = self.model
|
|
303
305
|
if self.api_key is not None:
|
|
304
|
-
completion_kwargs["api_key"] = self.api_key
|
|
306
|
+
completion_kwargs["api_key"] = self.api_key.get_secret_value()
|
|
305
307
|
if self.api_base is not None:
|
|
306
308
|
completion_kwargs["api_base"] = self.api_base
|
|
307
309
|
if self.timeout is not None:
|
|
@@ -501,7 +503,9 @@ class LLMChatBlock(BaseBlock):
|
|
|
501
503
|
for messages in messages_list
|
|
502
504
|
]
|
|
503
505
|
|
|
504
|
-
responses = await asyncio.gather(*tasks)
|
|
506
|
+
responses = await tqdm_asyncio.gather(
|
|
507
|
+
*tasks, desc=self.block_name, unit="req"
|
|
508
|
+
)
|
|
505
509
|
return responses
|
|
506
510
|
|
|
507
511
|
except Exception as e:
|
|
@@ -13,6 +13,7 @@ from pydantic import (
|
|
|
13
13
|
ConfigDict,
|
|
14
14
|
Field,
|
|
15
15
|
PrivateAttr,
|
|
16
|
+
SecretStr,
|
|
16
17
|
field_validator,
|
|
17
18
|
model_validator,
|
|
18
19
|
)
|
|
@@ -793,7 +794,10 @@ class Flow(BaseModel):
|
|
|
793
794
|
if api_base is not None:
|
|
794
795
|
config_params["api_base"] = api_base
|
|
795
796
|
if api_key is not None:
|
|
796
|
-
|
|
797
|
+
# Convert string api_key to SecretStr for automatic redaction in logs
|
|
798
|
+
config_params["api_key"] = (
|
|
799
|
+
SecretStr(api_key) if isinstance(api_key, str) else api_key
|
|
800
|
+
)
|
|
797
801
|
|
|
798
802
|
# Add any additional kwargs (temperature, max_tokens, etc.)
|
|
799
803
|
config_params.update(kwargs)
|
|
@@ -855,6 +859,7 @@ class Flow(BaseModel):
|
|
|
855
859
|
|
|
856
860
|
if modified_count > 0:
|
|
857
861
|
# Enhanced logging showing what was configured
|
|
862
|
+
# Note: SecretStr values automatically display as '**********' in logs
|
|
858
863
|
param_summary = []
|
|
859
864
|
for param_name, param_value in config_params.items():
|
|
860
865
|
if param_name == "model":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sdg_hub
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: Synthetic Data Generation
|
|
5
5
|
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -33,6 +33,20 @@ Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
|
33
33
|
Requires-Dist: python-dotenv<2.0.0,>=1.0.0
|
|
34
34
|
Requires-Dist: tenacity!=8.4.0,>=8.3.0
|
|
35
35
|
Requires-Dist: tqdm<5.0.0,>=4.66.2
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
|
|
38
|
+
Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
|
|
39
|
+
Requires-Dist: pylint-pydantic; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
42
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
43
|
+
Requires-Dist: pytest-html; extra == "dev"
|
|
44
|
+
Requires-Dist: tox<5,>=4.4.2; extra == "dev"
|
|
45
|
+
Requires-Dist: ruff; extra == "dev"
|
|
46
|
+
Requires-Dist: pytest-env; extra == "dev"
|
|
47
|
+
Requires-Dist: nbconvert>=7.0.0; extra == "dev"
|
|
48
|
+
Provides-Extra: integration
|
|
49
|
+
Requires-Dist: nest-asyncio; extra == "integration"
|
|
36
50
|
Provides-Extra: examples
|
|
37
51
|
Requires-Dist: tabulate>=0.9.0; extra == "examples"
|
|
38
52
|
Requires-Dist: transformers>=4.37.0; extra == "examples"
|
|
@@ -46,20 +60,7 @@ Requires-Dist: nltk; extra == "examples"
|
|
|
46
60
|
Requires-Dist: sentence-transformers; extra == "examples"
|
|
47
61
|
Requires-Dist: instructor; extra == "examples"
|
|
48
62
|
Requires-Dist: fastapi; extra == "examples"
|
|
49
|
-
Requires-Dist: nest-asyncio; extra == "examples"
|
|
50
63
|
Requires-Dist: ipykernel; extra == "examples"
|
|
51
|
-
Provides-Extra: dev
|
|
52
|
-
Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
|
|
53
|
-
Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
|
|
54
|
-
Requires-Dist: pylint-pydantic; extra == "dev"
|
|
55
|
-
Requires-Dist: pytest; extra == "dev"
|
|
56
|
-
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
57
|
-
Requires-Dist: pytest-cov; extra == "dev"
|
|
58
|
-
Requires-Dist: pytest-html; extra == "dev"
|
|
59
|
-
Requires-Dist: tox<5,>=4.4.2; extra == "dev"
|
|
60
|
-
Requires-Dist: ruff; extra == "dev"
|
|
61
|
-
Requires-Dist: pytest-env; extra == "dev"
|
|
62
|
-
Requires-Dist: nbconvert>=7.0.0; extra == "dev"
|
|
63
64
|
Dynamic: license-file
|
|
64
65
|
|
|
65
66
|
# `sdg_hub`: Synthetic Data Generation Toolkit
|
|
@@ -54,6 +54,7 @@ examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation
|
|
|
54
54
|
examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb
|
|
55
55
|
examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py
|
|
56
56
|
examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py
|
|
57
|
+
examples/knowledge_tuning/enhanced_summary_knowledge_tuning/imgs/quality_benchmark_accuracy.png
|
|
57
58
|
examples/knowledge_tuning/instructlab/.gitignore
|
|
58
59
|
examples/knowledge_tuning/instructlab/README.md
|
|
59
60
|
examples/knowledge_tuning/instructlab/docling_v2_config.yaml
|
|
@@ -527,13 +527,15 @@ class TestFlow:
|
|
|
527
527
|
):
|
|
528
528
|
"""Create a mock LLM block with model attributes."""
|
|
529
529
|
# First Party
|
|
530
|
+
from pydantic import SecretStr
|
|
530
531
|
from tests.flow.conftest import MockBlock
|
|
531
532
|
|
|
532
533
|
block = MockBlock(block_name=name, input_cols=["input"], output_cols=["output"])
|
|
533
534
|
# Add LLM-related attributes
|
|
534
535
|
block.model = model
|
|
535
536
|
block.api_base = api_base
|
|
536
|
-
|
|
537
|
+
# Convert api_key to SecretStr to match real LLM blocks
|
|
538
|
+
block.api_key = SecretStr(api_key) if isinstance(api_key, str) else api_key
|
|
537
539
|
block.temperature = 0.0
|
|
538
540
|
block.max_tokens = 1024
|
|
539
541
|
return block
|
|
@@ -637,7 +639,7 @@ class TestFlow:
|
|
|
637
639
|
# Check that LLM blocks were modified
|
|
638
640
|
assert flow.blocks[1].model == "new-model" # llm_block1
|
|
639
641
|
assert flow.blocks[1].api_base == "http://localhost:8101/v1"
|
|
640
|
-
assert flow.blocks[1].api_key == "NEW_KEY"
|
|
642
|
+
assert flow.blocks[1].api_key.get_secret_value() == "NEW_KEY"
|
|
641
643
|
assert flow.blocks[1].temperature == 0.7
|
|
642
644
|
assert flow.blocks[1].max_tokens == 2048
|
|
643
645
|
|
|
@@ -696,7 +698,7 @@ class TestFlow:
|
|
|
696
698
|
|
|
697
699
|
# Other parameters should remain unchanged
|
|
698
700
|
assert flow.blocks[0].api_base == "http://localhost:8000/v1"
|
|
699
|
-
assert flow.blocks[0].api_key == "OLD_KEY"
|
|
701
|
+
assert flow.blocks[0].api_key.get_secret_value() == "OLD_KEY"
|
|
700
702
|
assert flow.blocks[0].max_tokens == 1024
|
|
701
703
|
|
|
702
704
|
def test_set_model_config_with_kwargs(self):
|
|
@@ -793,7 +795,7 @@ class TestFlow:
|
|
|
793
795
|
|
|
794
796
|
# Everything else should remain the same
|
|
795
797
|
assert flow.blocks[0].api_base == "http://localhost:8000/v1"
|
|
796
|
-
assert flow.blocks[0].api_key == "ORIGINAL_KEY"
|
|
798
|
+
assert flow.blocks[0].api_key.get_secret_value() == "ORIGINAL_KEY"
|
|
797
799
|
assert flow.blocks[0].temperature == 0.5
|
|
798
800
|
assert flow.blocks[0].max_tokens == 1024
|
|
799
801
|
assert flow.blocks[0].custom_param == "custom_value"
|
|
@@ -1422,3 +1424,47 @@ class TestFlow:
|
|
|
1422
1424
|
FlowValidationError, match="max_concurrency must be greater than 0"
|
|
1423
1425
|
):
|
|
1424
1426
|
flow.generate(dataset, max_concurrency=-1)
|
|
1427
|
+
|
|
1428
|
+
def test_set_model_config_redacts_sensitive_params(self, caplog):
|
|
1429
|
+
"""Test API key and secrets redaction in logs using Pydantic SecretStr.
|
|
1430
|
+
|
|
1431
|
+
Verifies that sensitive parameters (api_key) are automatically redacted
|
|
1432
|
+
by SecretStr while non-sensitive ones remain visible.
|
|
1433
|
+
"""
|
|
1434
|
+
# Standard
|
|
1435
|
+
import logging
|
|
1436
|
+
|
|
1437
|
+
# First Party
|
|
1438
|
+
from sdg_hub.core.blocks.llm.llm_chat_block import LLMChatBlock
|
|
1439
|
+
|
|
1440
|
+
llm_block = LLMChatBlock(
|
|
1441
|
+
block_name="test_llm", input_cols="messages", output_cols="response"
|
|
1442
|
+
)
|
|
1443
|
+
flow = Flow(metadata=self.test_metadata, blocks=[llm_block])
|
|
1444
|
+
|
|
1445
|
+
with caplog.at_level(logging.INFO, logger="sdg_hub.core.flow.base"):
|
|
1446
|
+
flow.set_model_config(
|
|
1447
|
+
model="openai/gpt-4",
|
|
1448
|
+
api_key="sk-secret-key",
|
|
1449
|
+
temperature=0.7,
|
|
1450
|
+
max_tokens=100,
|
|
1451
|
+
)
|
|
1452
|
+
|
|
1453
|
+
log_messages = [record.message for record in caplog.records]
|
|
1454
|
+
relevant_logs = [
|
|
1455
|
+
msg for msg in log_messages if "Successfully configured" in msg
|
|
1456
|
+
]
|
|
1457
|
+
assert len(relevant_logs) > 0
|
|
1458
|
+
log_text = relevant_logs[0]
|
|
1459
|
+
|
|
1460
|
+
# Sensitive params must be redacted - SecretStr displays as '**********'
|
|
1461
|
+
assert "**********" in log_text or "SecretStr" in log_text
|
|
1462
|
+
assert "sk-secret-key" not in log_text
|
|
1463
|
+
|
|
1464
|
+
# Non-sensitive params should be visible
|
|
1465
|
+
assert "temperature: 0.7" in log_text
|
|
1466
|
+
assert "max_tokens: 100" in log_text
|
|
1467
|
+
|
|
1468
|
+
# Verify that the api_key was actually set as a SecretStr on the block
|
|
1469
|
+
assert llm_block.api_key is not None
|
|
1470
|
+
assert llm_block.api_key.get_secret_value() == "sk-secret-key"
|
|
@@ -22,28 +22,30 @@ commands =
|
|
|
22
22
|
integration: {envpython} -m pytest {posargs:tests/integration}
|
|
23
23
|
|
|
24
24
|
# Integration test environment - runs notebook-based integration tests
|
|
25
|
+
# Lightweight: only requires nest-asyncio, NO torch/transformers needed
|
|
25
26
|
[testenv:py3-integration]
|
|
26
|
-
description = run integration tests (notebooks)
|
|
27
|
+
description = run integration tests (notebooks) - lightweight, no torch/transformers needed
|
|
27
28
|
package = wheel
|
|
28
29
|
wheel_build_env = pkg
|
|
29
30
|
passenv =
|
|
30
31
|
OPENAI_API_KEY
|
|
31
32
|
deps =
|
|
32
33
|
.[dev]
|
|
33
|
-
.[examples]
|
|
34
|
+
.[integration]
|
|
34
35
|
commands =
|
|
35
36
|
{envpython} -m pytest {posargs:tests/integration -v -s}
|
|
36
37
|
|
|
37
38
|
# Integration test environment with coverage - runs notebook-based integration tests with coverage collection
|
|
39
|
+
# Lightweight: only requires nest-asyncio, NO torch/transformers needed
|
|
38
40
|
[testenv:py3-integrationcov]
|
|
39
|
-
description = run integration tests (notebooks) with coverage
|
|
41
|
+
description = run integration tests (notebooks) with coverage - lightweight, no torch/transformers needed
|
|
40
42
|
package = wheel
|
|
41
43
|
wheel_build_env = pkg
|
|
42
44
|
passenv =
|
|
43
45
|
OPENAI_API_KEY
|
|
44
46
|
deps =
|
|
45
47
|
.[dev]
|
|
46
|
-
.[examples]
|
|
48
|
+
.[integration]
|
|
47
49
|
commands =
|
|
48
50
|
{envpython} -m pytest --cov=sdg_hub --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html tests/integration {posargs:-v -s}
|
|
49
51
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|