sdg-hub 0.7.1__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/actionlint.dockerfile +1 -1
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/docs.yml +1 -1
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/integration-test.yml +2 -2
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/pypi.yaml +3 -3
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/PKG-INFO +2 -2
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/llm-blocks.md +2 -2
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/flows/overview.md +3 -3
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +26 -17
- sdg_hub-0.7.3/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/imgs/quality_benchmark_accuracy.png +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/knowledge_utils.py +12 -6
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/text_analysis/structured_insights_demo.ipynb +3 -3
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/pyproject.toml +1 -1
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/_version.py +3 -3
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/__init__.py +9 -2
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/base.py +4 -1
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/__init__.py +3 -2
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +11 -5
- sdg_hub-0.7.1/src/sdg_hub/core/blocks/llm/llm_parser_block.py → sdg_hub-0.7.3/src/sdg_hub/core/blocks/llm/llm_response_extractor_block.py +32 -9
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/text_parser_block.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/json_structure_block.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/melt_columns.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/rename_columns.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/text_concat.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +2 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/base.py +13 -32
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/flow_metrics.py +3 -3
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/flow.yaml +6 -6
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +4 -4
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +3 -3
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +4 -4
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +2 -2
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +7 -7
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +7 -7
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +4 -4
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/PKG-INFO +2 -2
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/SOURCES.txt +3 -2
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/requires.txt +1 -1
- sdg_hub-0.7.1/tests/blocks/llm/test_llm_parser_block.py → sdg_hub-0.7.3/tests/blocks/llm/test_llm_response_extractor_block.py +55 -52
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/llm/test_promptbuilderblock.py +1 -1
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/test_base_block.py +4 -3
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_base.py +78 -4
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/utils/test_flow_metrics.py +11 -11
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/actionlint.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/actions/free-disk-space/action.yml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/dependabot.yml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/mergify.yml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/actionlint.yml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/lint.yml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/matchers/actionlint.json +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/matchers/pylint.json +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/packer.yml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/test.yml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.gitignore +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.isort.cfg +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.markdownlint-cli2.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.pre-commit-config.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.pylintrc +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/CLAUDE.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/CONTRIBUTING.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/LICENSE +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/Makefile +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/README.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/.nojekyll +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/README.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/_coverpage.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/_navbar.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/_sidebar.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/api-reference.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/assets/logo.png +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/assets/sdg-hub-cover.png +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/custom-blocks.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/filtering-blocks.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/overview.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/transform-blocks.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/concepts.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/development.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/flows/available-flows.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/flows/custom-flows.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/flows/discovery.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/index.html +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/installation.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/quick-start.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/README.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/knowledge_generation_ja.ipynb +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/rag_evaluation/ibm-annual-report-2024.pdf +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/rag_evaluation/rag_evaluation_dataset_generation.ipynb +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/text_analysis/README.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/scripts/packer/centos.pkr.hcl +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/scripts/packer/setup-centos.sh +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/scripts/ruff.sh +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/scripts/snyk_notebook_scan.sh +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/setup.cfg +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/registry.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/checkpointer.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/metadata.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/registry.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/validation.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/datautils.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/error_handling.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/logger_config.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/path_resolution.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/time_estimator.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/answer_generation.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/conceptual_qa_generation.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/context_extraction.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/groundedness_critic.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/question_evolution.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/topic_generation.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/py.typed +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/top_level.txt +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/llm/test_llm_chat_block.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/llm/test_textparserblock.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/test_registry.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_config.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_index_based_mapper.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_json_structure_block.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_melt_columns.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_rename_columns.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_text_concat.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_uniform_col_val_setter.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/conftest.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_checkpointer.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_dataset_requirements.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_integration.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_metadata.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_registry.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_time_estimation.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_validation.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/README.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/utils/test_datautils.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/utils/test_error_handling.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/utils/test_path_resolution.py +0 -0
- {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tox.ini +0 -0
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
# Since dependabot cannot update workflows using docker,
|
|
2
2
|
# we use this indirection since dependabot can update this file.
|
|
3
|
-
FROM rhysd/actionlint:1.7.
|
|
3
|
+
FROM rhysd/actionlint:1.7.10@sha256:ef8299f97635c4c30e2298f48f30763ab782a4ad2c95b744649439a039421e36
|
|
@@ -39,6 +39,6 @@ jobs:
|
|
|
39
39
|
- name: "Checkout"
|
|
40
40
|
uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
|
|
41
41
|
- name: "Check Markdown documents"
|
|
42
|
-
uses: DavidAnson/markdownlint-cli2-action@
|
|
42
|
+
uses: DavidAnson/markdownlint-cli2-action@07035fd053f7be764496c0f8d8f9f41f98305101 # v22.0.0
|
|
43
43
|
with:
|
|
44
44
|
globs: '**/*.md'
|
|
@@ -112,7 +112,7 @@ jobs:
|
|
|
112
112
|
|
|
113
113
|
|
|
114
114
|
- name: Cache huggingface datasets
|
|
115
|
-
uses: actions/cache@
|
|
115
|
+
uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
|
|
116
116
|
with:
|
|
117
117
|
path: ~/.cache/huggingface
|
|
118
118
|
# Invalidate cache when any example notebook changes (may affect dataset downloads)
|
|
@@ -140,7 +140,7 @@ jobs:
|
|
|
140
140
|
flags: integration
|
|
141
141
|
|
|
142
142
|
- name: Upload integration test artifacts
|
|
143
|
-
uses: actions/upload-artifact@
|
|
143
|
+
uses: actions/upload-artifact@v6
|
|
144
144
|
if: always()
|
|
145
145
|
with:
|
|
146
146
|
name: integration-test-results-${{ matrix.python }}-${{ matrix.platform }}
|
|
@@ -72,7 +72,7 @@ jobs:
|
|
|
72
72
|
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
|
|
73
73
|
|
|
74
74
|
- name: "Download build artifacts"
|
|
75
|
-
uses: actions/download-artifact@
|
|
75
|
+
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
|
|
76
76
|
with:
|
|
77
77
|
name: Packages
|
|
78
78
|
path: dist
|
|
@@ -104,13 +104,13 @@ jobs:
|
|
|
104
104
|
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
|
|
105
105
|
|
|
106
106
|
- name: "Download build artifacts"
|
|
107
|
-
uses: actions/download-artifact@
|
|
107
|
+
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
|
|
108
108
|
with:
|
|
109
109
|
name: Packages
|
|
110
110
|
path: dist
|
|
111
111
|
|
|
112
112
|
- name: "Sigstore sign package"
|
|
113
|
-
uses: sigstore/gh-action-sigstore-python@
|
|
113
|
+
uses: sigstore/gh-action-sigstore-python@a5caf349bc536fbef3668a10ed7f5cd309a4b53d # v3.2.0
|
|
114
114
|
with:
|
|
115
115
|
inputs: |
|
|
116
116
|
./dist/*.tar.gz
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sdg_hub
|
|
3
|
-
Version: 0.7.
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: Synthetic Data Generation
|
|
5
5
|
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -26,7 +26,7 @@ Requires-Dist: click<9.0.0,>=8.1.7
|
|
|
26
26
|
Requires-Dist: datasets>=4.0.0
|
|
27
27
|
Requires-Dist: httpx<1.0.0,>=0.25.0
|
|
28
28
|
Requires-Dist: jinja2
|
|
29
|
-
Requires-Dist: litellm<
|
|
29
|
+
Requires-Dist: litellm<2.0.0,>=1.73.0
|
|
30
30
|
Requires-Dist: rich
|
|
31
31
|
Requires-Dist: pandas
|
|
32
32
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
@@ -603,7 +603,7 @@ print(result["judgment"]) # ['YES']
|
|
|
603
603
|
TextParserBlock is commonly used after LLMChatBlock to structure responses:
|
|
604
604
|
|
|
605
605
|
```python
|
|
606
|
-
from sdg_hub.core.blocks import LLMChatBlock,
|
|
606
|
+
from sdg_hub.core.blocks import LLMChatBlock, LLMResponseExtractorBlock, TextParserBlock
|
|
607
607
|
|
|
608
608
|
# Step 1: Generate LLM response
|
|
609
609
|
chat_block = LLMChatBlock(
|
|
@@ -615,7 +615,7 @@ chat_block = LLMChatBlock(
|
|
|
615
615
|
|
|
616
616
|
# Step 2: Extract content from response object
|
|
617
617
|
# Use field_prefix="" to get cleaner column names
|
|
618
|
-
llm_parser =
|
|
618
|
+
llm_parser = LLMResponseExtractorBlock(
|
|
619
619
|
block_name="extract_eval",
|
|
620
620
|
input_cols=["eval_response"],
|
|
621
621
|
extract_content=True,
|
|
@@ -316,7 +316,7 @@ blocks:
|
|
|
316
316
|
output_cols: ["eval_response"]
|
|
317
317
|
async_mode: true
|
|
318
318
|
|
|
319
|
-
- block_type: "
|
|
319
|
+
- block_type: "LLMResponseExtractorBlock"
|
|
320
320
|
block_config:
|
|
321
321
|
block_name: "extract_eval_content"
|
|
322
322
|
input_cols: ["eval_response"]
|
|
@@ -537,7 +537,7 @@ result = flow.generate(
|
|
|
537
537
|
| | `top_p` | Nucleus sampling threshold | `0.0` - `1.0` |
|
|
538
538
|
| | `frequency_penalty` | Penalize token repetition | `-2.0` - `2.0` |
|
|
539
539
|
| | `presence_penalty` | Penalize new topics | `-2.0` - `2.0` |
|
|
540
|
-
| **
|
|
540
|
+
| **LLMResponseExtractorBlock** | `extract_content` | Extract main content field | `True`, `False` |
|
|
541
541
|
| | `extract_reasoning_content` | Extract reasoning/thinking | `True`, `False` |
|
|
542
542
|
| | `extract_tool_calls` | Extract tool call data | `True`, `False` |
|
|
543
543
|
| | `field_prefix` | Prefix for output fields | `"llm_"`, `"parsed_"` |
|
|
@@ -752,7 +752,7 @@ result = flow.generate(dataset)
|
|
|
752
752
|
│ │ generate_question │ LLMChatBlock │ 45.30s │ 100 → 100 │ +1 │ ✓││
|
|
753
753
|
│ │ generate_answer │ LLMChatBlock │ 78.45s │ 100 → 100 │ +1 │ ✓││
|
|
754
754
|
│ │ eval_faithfulness... │ LLMChatBlock │ 52.20s │ 100 → 100 │ +1 │ ✓││
|
|
755
|
-
│ │ extract_eval_con... │
|
|
755
|
+
│ │ extract_eval_con... │ LLMResponseExtractorBlock │ 0.15s │ 100 → 100 │ +2 │ ✓││
|
|
756
756
|
│ │ parse_evaluation │ TextParserBlock │ 0.22s │ 100 → 100 │ +2 │ ✓││
|
|
757
757
|
│ │ filter_faithful │ ColumnValueF... │ 0.08s │ 100 → 87 │ — │ ✓││
|
|
758
758
|
│ ├──────────────────────┼─────────────────┼──────────┼──────────────┼─────────┼──┤│
|
|
@@ -48,29 +48,38 @@ Only claims passing this check are retained. This process filters out **hallucin
|
|
|
48
48
|
|
|
49
49
|
---
|
|
50
50
|
|
|
51
|
-
## Data Generation Statistics
|
|
51
|
+
## Data Generation Statistics and Results
|
|
52
|
+
|
|
53
|
+
**Teacher model for generation:** `openai/gpt-oss-120b`
|
|
54
|
+
**Student model trained:** `meta-llama/Llama-3.1-8B-Instruct`
|
|
55
|
+
**Training method:** Supervised Fine-Tuning (SFT)
|
|
56
|
+
|
|
57
|
+
---
|
|
52
58
|
|
|
53
59
|
### Summary Augmentation
|
|
54
60
|
|
|
55
|
-
Each “cut” represents the total number of
|
|
61
|
+
For each document, we generate three augmentation types—detailed summaries, extractive summaries, and atomic facts. Each “cut” on the table below represents the total number of summary augmentations per document (i.e., how many times each augmentation process is run).
|
|
56
62
|
|
|
57
|
-
| Cut (NUMBER\_OF\_SUMMARIES = 3) | Token Count
|
|
58
|
-
| ------------------------------- |
|
|
59
|
-
|
|
|
60
|
-
|
|
|
61
|
-
|
|
|
62
|
-
|
|
|
63
|
-
|
|
|
64
|
-
|
|
|
65
|
-
| 40 | 87,118,308 |
|
|
66
|
-
| 50 | 108,779,213 |
|
|
63
|
+
| Cut (NUMBER\_OF\_SUMMARIES = 3) | Token Count |
|
|
64
|
+
| ------------------------------- | ------------- |
|
|
65
|
+
| Input Corpus | 1,517,465 |
|
|
66
|
+
| 10 | 87,248,889 |
|
|
67
|
+
| 20 | 158,615,276 |
|
|
68
|
+
| 30 | 230,306,195 |
|
|
69
|
+
| 40 | 301,805,906 |
|
|
70
|
+
| 50 | 373,183,414 |
|
|
67
71
|
|
|
68
72
|
---
|
|
69
73
|
|
|
70
|
-
###
|
|
74
|
+
### Benchmark Results
|
|
71
75
|
|
|
72
|
-
|
|
76
|
+
- **Evaluation benchmark:** [QuALITY benchmark](https://nyu-mll.github.io/quality/)
|
|
77
|
+
- **Evaluation script & metric:** [Synthetic_Continued_Pretraining](https://github.com/ZitongYang/Synthetic_Continued_Pretraining/blob/main/evaluation.py), Exact Match (EM)
|
|
78
|
+
- **Student model:** meta-llama/Llama-3.1-8B-Instruct (after SFT on generated/augmented summaries)
|
|
79
|
+
- **Performance metric:** Model accuracy
|
|
73
80
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
81
|
+

|
|
82
|
+
|
|
83
|
+
*Figure: Model accuracy across the QuALITY benchmark datasets, comparing SFT training on enhanced document summaries with the original model performance.*
|
|
84
|
+
|
|
85
|
+
---
|
|
Binary file
|
|
@@ -602,13 +602,14 @@ def _num_chars_from_tokens(num_tokens) -> int:
|
|
|
602
602
|
return int(num_tokens * 4) # 1 token ~ 4 English character
|
|
603
603
|
|
|
604
604
|
|
|
605
|
-
def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
|
|
605
|
+
def chunk_document(documents: List, server_ctx_size, chunk_word_count, **kwargs) -> List[str]:
|
|
606
606
|
"""
|
|
607
607
|
Iterates over the documents and splits them into chunks based on the word count provided by the user.
|
|
608
608
|
Args:
|
|
609
609
|
documents (list): List of documents retrieved from git (can also consist of a single document).
|
|
610
610
|
server_ctx_size (int): Context window size of server.
|
|
611
611
|
chunk_word_count (int): Maximum number of words to chunk a document.
|
|
612
|
+
chunk_overlap (int): Overlap in characters between chunks.
|
|
612
613
|
Returns:
|
|
613
614
|
List[str]: List of chunked documents.
|
|
614
615
|
"""
|
|
@@ -634,7 +635,7 @@ def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[s
|
|
|
634
635
|
# Placeholder for params
|
|
635
636
|
content = []
|
|
636
637
|
chunk_size = _num_chars_from_tokens(no_tokens_per_doc)
|
|
637
|
-
chunk_overlap = _DEFAULT_CHUNK_OVERLAP
|
|
638
|
+
chunk_overlap = int(kwargs.pop("chunk_overlap", str(_DEFAULT_CHUNK_OVERLAP)))
|
|
638
639
|
|
|
639
640
|
# Using Markdown as default, document-specific chunking will be implemented in seperate pr.
|
|
640
641
|
text_splitter = RecursiveCharacterTextSplitter.from_language(
|
|
@@ -729,16 +730,21 @@ class DocProcessor:
|
|
|
729
730
|
}
|
|
730
731
|
)
|
|
731
732
|
|
|
732
|
-
def _add_icls(self, chunked_document: Dataset) -> Dataset:
|
|
733
|
+
def _add_icls(self, chunked_document: Dataset, **kwargs) -> Dataset:
|
|
733
734
|
"""
|
|
734
735
|
Add the ICLS label to the dataset.
|
|
735
736
|
Args:
|
|
736
737
|
dataset (Dataset): Dataset object.
|
|
738
|
+
server_ctx_size (int): Context window size of server.
|
|
739
|
+
chunk_word_count (int): Maximum number of words to chunk a document.
|
|
740
|
+
chunk_overlap (int): Overlap in characters between chunks.
|
|
737
741
|
|
|
738
742
|
Returns
|
|
739
743
|
-------
|
|
740
744
|
Dataset: Dataset object with ICLS label.
|
|
741
745
|
"""
|
|
746
|
+
server_ctx_size = int(kwargs.pop("server_ctx_size", "4096"))
|
|
747
|
+
chunk_word_count = int(kwargs.pop("chunk_word_count", "1024"))
|
|
742
748
|
icl = self.user_config["seed_examples"]
|
|
743
749
|
chunked_document_all_icl = []
|
|
744
750
|
for icl_ in icl:
|
|
@@ -762,7 +768,7 @@ class DocProcessor:
|
|
|
762
768
|
chunked_document_all_icl = chunked_document_all_icl.map(
|
|
763
769
|
lambda x: {
|
|
764
770
|
"chunks": chunk_document(
|
|
765
|
-
[x["document"]], server_ctx_size=
|
|
771
|
+
[x["document"]], server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, **kwargs
|
|
766
772
|
)
|
|
767
773
|
if get_token_count(x["document"], self.tokenizer) > 1024
|
|
768
774
|
else [x["document"]]
|
|
@@ -797,7 +803,7 @@ class DocProcessor:
|
|
|
797
803
|
df = safe_concatenate_datasets([ds.to_pandas() for ds in datasets])
|
|
798
804
|
return Dataset.from_pandas(df) if df is not None else None
|
|
799
805
|
|
|
800
|
-
def get_processed_markdown_dataset(self, list_md_files: list[Path]) -> Dataset:
|
|
806
|
+
def get_processed_markdown_dataset(self, list_md_files: list[Path], **kwargs) -> Dataset:
|
|
801
807
|
chunks_mds = []
|
|
802
808
|
for md_file in list_md_files:
|
|
803
809
|
with open(md_file, "r", encoding="utf-8") as f:
|
|
@@ -811,5 +817,5 @@ class DocProcessor:
|
|
|
811
817
|
}
|
|
812
818
|
)
|
|
813
819
|
chunk_ds = Dataset.from_list(chunks_mds)
|
|
814
|
-
chunk_ds_with_icls = self._add_icls(chunk_ds)
|
|
820
|
+
chunk_ds_with_icls = self._add_icls(chunk_ds, **kwargs)
|
|
815
821
|
return chunk_ds_with_icls
|
|
@@ -332,7 +332,7 @@
|
|
|
332
332
|
" LLMChatBlock,\n",
|
|
333
333
|
" PromptBuilderBlock,\n",
|
|
334
334
|
" TextParserBlock,\n",
|
|
335
|
-
"
|
|
335
|
+
" LLMResponseExtractorBlock,\n",
|
|
336
336
|
")\n",
|
|
337
337
|
"from sdg_hub.core.blocks.transform import JSONStructureBlock\n",
|
|
338
338
|
"\n",
|
|
@@ -355,7 +355,7 @@
|
|
|
355
355
|
" temperature=0.1, # Low temperature for more consistent extraction\n",
|
|
356
356
|
")\n",
|
|
357
357
|
"\n",
|
|
358
|
-
"
|
|
358
|
+
"ticker_llm_response_extractor_block = LLMResponseExtractorBlock(\n",
|
|
359
359
|
" block_name=\"extract_stock_tickers\",\n",
|
|
360
360
|
" input_cols=[\"raw_stock_tickers\"],\n",
|
|
361
361
|
" extract_content=True,\n",
|
|
@@ -406,7 +406,7 @@
|
|
|
406
406
|
"ticker_blocks = [\n",
|
|
407
407
|
" ticker_prompt_block,\n",
|
|
408
408
|
" ticker_llm_block,\n",
|
|
409
|
-
"
|
|
409
|
+
" ticker_llm_response_extractor_block,\n",
|
|
410
410
|
" ticker_parser_block,\n",
|
|
411
411
|
" enhanced_json_block,\n",
|
|
412
412
|
"]\n",
|
|
@@ -33,7 +33,7 @@ dependencies = [
|
|
|
33
33
|
"datasets>=4.0.0",
|
|
34
34
|
"httpx>=0.25.0,<1.0.0",
|
|
35
35
|
"jinja2",
|
|
36
|
-
"litellm>=1.73.0,<1.75.0
|
|
36
|
+
"litellm>=1.73.0,<2.0.0", # raising cap since tests run without errors related to 'backoff' cap back to <1.75.0 if errors surface
|
|
37
37
|
"rich",
|
|
38
38
|
"pandas",
|
|
39
39
|
"pydantic>=2.0.0,<3.0.0", # cap before v3; adjust the lower bound to the minimum v2.x you’ve tested
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.7.
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 7,
|
|
31
|
+
__version__ = version = '0.7.3'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 7, 3)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g97824a47f'
|
|
@@ -6,7 +6,13 @@ This package provides various block implementations for data generation, process
|
|
|
6
6
|
# Local
|
|
7
7
|
from .base import BaseBlock
|
|
8
8
|
from .filtering import ColumnValueFilterBlock
|
|
9
|
-
from .llm import
|
|
9
|
+
from .llm import (
|
|
10
|
+
LLMChatBlock,
|
|
11
|
+
LLMParserBlock,
|
|
12
|
+
LLMResponseExtractorBlock,
|
|
13
|
+
PromptBuilderBlock,
|
|
14
|
+
TextParserBlock,
|
|
15
|
+
)
|
|
10
16
|
from .registry import BlockRegistry
|
|
11
17
|
from .transform import (
|
|
12
18
|
DuplicateColumnsBlock,
|
|
@@ -28,7 +34,8 @@ __all__ = [
|
|
|
28
34
|
"TextConcatBlock",
|
|
29
35
|
"UniformColumnValueSetter",
|
|
30
36
|
"LLMChatBlock",
|
|
31
|
-
"LLMParserBlock",
|
|
37
|
+
"LLMParserBlock", # Deprecated alias for LLMResponseExtractorBlock
|
|
38
|
+
"LLMResponseExtractorBlock",
|
|
32
39
|
"TextParserBlock",
|
|
33
40
|
"PromptBuilderBlock",
|
|
34
41
|
]
|
|
@@ -49,6 +49,9 @@ class BaseBlock(BaseModel, ABC):
|
|
|
49
49
|
block_name: str = Field(
|
|
50
50
|
..., description="Unique identifier for this block instance"
|
|
51
51
|
)
|
|
52
|
+
block_type: Optional[str] = Field(
|
|
53
|
+
None, description="Block type (e.g., 'llm', 'transform', 'parser', 'filtering')"
|
|
54
|
+
)
|
|
52
55
|
input_cols: Union[str, list[str], dict[str, Any], None] = Field(
|
|
53
56
|
None, description="Input columns: str, list, or dict"
|
|
54
57
|
)
|
|
@@ -366,5 +369,5 @@ class BaseBlock(BaseModel, ABC):
|
|
|
366
369
|
Dict[str, Any]
|
|
367
370
|
"""
|
|
368
371
|
config = self.get_config()
|
|
369
|
-
config["
|
|
372
|
+
config["block_class"] = self.__class__.__name__
|
|
370
373
|
return config
|
|
@@ -46,6 +46,8 @@ DTYPE_MAP = {
|
|
|
46
46
|
"Filters datasets based on column values using various comparison operations",
|
|
47
47
|
)
|
|
48
48
|
class ColumnValueFilterBlock(BaseBlock):
|
|
49
|
+
block_type: str = "filtering"
|
|
50
|
+
|
|
49
51
|
"""A block for filtering datasets based on column values.
|
|
50
52
|
|
|
51
53
|
This block allows filtering of datasets using various operations (e.g., equals, contains)
|
|
@@ -9,7 +9,7 @@ local models (vLLM, Ollama), and more.
|
|
|
9
9
|
# Local
|
|
10
10
|
from .error_handler import ErrorCategory, LLMErrorHandler
|
|
11
11
|
from .llm_chat_block import LLMChatBlock
|
|
12
|
-
from .
|
|
12
|
+
from .llm_response_extractor_block import LLMParserBlock, LLMResponseExtractorBlock
|
|
13
13
|
from .prompt_builder_block import PromptBuilderBlock
|
|
14
14
|
from .text_parser_block import TextParserBlock
|
|
15
15
|
|
|
@@ -17,7 +17,8 @@ __all__ = [
|
|
|
17
17
|
"LLMErrorHandler",
|
|
18
18
|
"ErrorCategory",
|
|
19
19
|
"LLMChatBlock",
|
|
20
|
-
"LLMParserBlock",
|
|
20
|
+
"LLMParserBlock", # Deprecated alias for LLMResponseExtractorBlock
|
|
21
|
+
"LLMResponseExtractorBlock",
|
|
21
22
|
"PromptBuilderBlock",
|
|
22
23
|
"TextParserBlock",
|
|
23
24
|
]
|
|
@@ -6,7 +6,8 @@ from typing import Any, Optional
|
|
|
6
6
|
import asyncio
|
|
7
7
|
|
|
8
8
|
from litellm import acompletion, completion
|
|
9
|
-
from pydantic import ConfigDict, Field, field_validator
|
|
9
|
+
from pydantic import ConfigDict, Field, SecretStr, field_validator
|
|
10
|
+
from tqdm.asyncio import tqdm_asyncio
|
|
10
11
|
import litellm
|
|
11
12
|
|
|
12
13
|
# Third Party
|
|
@@ -31,6 +32,8 @@ logger = setup_logger(__name__)
|
|
|
31
32
|
class LLMChatBlock(BaseBlock):
|
|
32
33
|
model_config = ConfigDict(extra="allow")
|
|
33
34
|
|
|
35
|
+
block_type: str = "llm"
|
|
36
|
+
|
|
34
37
|
"""Unified LLM chat block supporting all providers via LiteLLM.
|
|
35
38
|
|
|
36
39
|
This block provides a minimal wrapper around LiteLLM's completion API,
|
|
@@ -52,8 +55,9 @@ class LLMChatBlock(BaseBlock):
|
|
|
52
55
|
model : Optional[str], optional
|
|
53
56
|
Model identifier in LiteLLM format. Can be set later via flow.set_model_config().
|
|
54
57
|
Examples: "openai/gpt-4", "anthropic/claude-3-sonnet-20240229"
|
|
55
|
-
api_key : Optional[
|
|
58
|
+
api_key : Optional[SecretStr], optional
|
|
56
59
|
API key for the provider. Falls back to environment variables.
|
|
60
|
+
Automatically redacted in logs and string representations.
|
|
57
61
|
api_base : Optional[str], optional
|
|
58
62
|
Base URL for the API. Required for local models.
|
|
59
63
|
async_mode : bool, optional
|
|
@@ -97,7 +101,7 @@ class LLMChatBlock(BaseBlock):
|
|
|
97
101
|
model: Optional[str] = Field(
|
|
98
102
|
None, exclude=True, description="Model identifier in LiteLLM format"
|
|
99
103
|
)
|
|
100
|
-
api_key: Optional[
|
|
104
|
+
api_key: Optional[SecretStr] = Field(
|
|
101
105
|
None, exclude=True, description="API key for the provider"
|
|
102
106
|
)
|
|
103
107
|
api_base: Optional[str] = Field(
|
|
@@ -301,7 +305,7 @@ class LLMChatBlock(BaseBlock):
|
|
|
301
305
|
if self.model is not None:
|
|
302
306
|
completion_kwargs["model"] = self.model
|
|
303
307
|
if self.api_key is not None:
|
|
304
|
-
completion_kwargs["api_key"] = self.api_key
|
|
308
|
+
completion_kwargs["api_key"] = self.api_key.get_secret_value()
|
|
305
309
|
if self.api_base is not None:
|
|
306
310
|
completion_kwargs["api_base"] = self.api_base
|
|
307
311
|
if self.timeout is not None:
|
|
@@ -501,7 +505,9 @@ class LLMChatBlock(BaseBlock):
|
|
|
501
505
|
for messages in messages_list
|
|
502
506
|
]
|
|
503
507
|
|
|
504
|
-
responses = await
|
|
508
|
+
responses = await tqdm_asyncio.gather(
|
|
509
|
+
*tasks, desc=self.block_name, unit="req"
|
|
510
|
+
)
|
|
505
511
|
return responses
|
|
506
512
|
|
|
507
513
|
except Exception as e:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
-
"""LLM
|
|
2
|
+
"""LLM response extractor block for extracting fields from LLM response objects.
|
|
3
3
|
|
|
4
|
-
This module provides the
|
|
4
|
+
This module provides the LLMResponseExtractorBlock for extracting specific fields
|
|
5
5
|
(content, reasoning_content, tool_calls) from chat completion response objects.
|
|
6
6
|
"""
|
|
7
7
|
|
|
@@ -22,13 +22,15 @@ logger = setup_logger(__name__)
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
@BlockRegistry.register(
|
|
25
|
-
"
|
|
25
|
+
"LLMResponseExtractorBlock",
|
|
26
26
|
"llm",
|
|
27
27
|
"Extracts specified fields from LLM response objects",
|
|
28
28
|
)
|
|
29
|
-
class
|
|
29
|
+
class LLMResponseExtractorBlock(BaseBlock):
|
|
30
30
|
_flow_requires_jsonl_tmp: bool = True
|
|
31
31
|
|
|
32
|
+
block_type: str = "llm_util"
|
|
33
|
+
|
|
32
34
|
"""Block for extracting fields from LLM response objects.
|
|
33
35
|
|
|
34
36
|
This block extracts specified fields from chat completion response objects.
|
|
@@ -88,7 +90,7 @@ class LLMParserBlock(BaseBlock):
|
|
|
88
90
|
]
|
|
89
91
|
):
|
|
90
92
|
raise ValueError(
|
|
91
|
-
"
|
|
93
|
+
"LLMResponseExtractorBlock requires at least one extraction field to be enabled: "
|
|
92
94
|
"extract_content, extract_reasoning_content, or extract_tool_calls"
|
|
93
95
|
)
|
|
94
96
|
|
|
@@ -106,7 +108,7 @@ class LLMParserBlock(BaseBlock):
|
|
|
106
108
|
return self
|
|
107
109
|
|
|
108
110
|
def _validate_custom(self, dataset: pd.DataFrame) -> None:
|
|
109
|
-
"""Validate
|
|
111
|
+
"""Validate LLMResponseExtractorBlock specific requirements.
|
|
110
112
|
|
|
111
113
|
Parameters
|
|
112
114
|
----------
|
|
@@ -116,14 +118,16 @@ class LLMParserBlock(BaseBlock):
|
|
|
116
118
|
Raises
|
|
117
119
|
------
|
|
118
120
|
ValueError
|
|
119
|
-
If
|
|
121
|
+
If LLMResponseExtractorBlock requirements are not met.
|
|
120
122
|
"""
|
|
121
123
|
# Validate that we have exactly one input column
|
|
122
124
|
if len(self.input_cols) == 0:
|
|
123
|
-
raise ValueError(
|
|
125
|
+
raise ValueError(
|
|
126
|
+
"LLMResponseExtractorBlock expects at least one input column"
|
|
127
|
+
)
|
|
124
128
|
if len(self.input_cols) > 1:
|
|
125
129
|
logger.warning(
|
|
126
|
-
f"
|
|
130
|
+
f"LLMResponseExtractorBlock expects exactly one input column, but got {len(self.input_cols)}. "
|
|
127
131
|
f"Using the first column: {self.input_cols[0]}"
|
|
128
132
|
)
|
|
129
133
|
|
|
@@ -324,3 +328,22 @@ class LLMParserBlock(BaseBlock):
|
|
|
324
328
|
new_data.extend(self._generate(sample))
|
|
325
329
|
|
|
326
330
|
return pd.DataFrame(new_data)
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
# Backwards compatibility alias (deprecated)
|
|
334
|
+
# Register deprecated alias in BlockRegistry so old YAML flows still work
|
|
335
|
+
@BlockRegistry.register(
|
|
336
|
+
"LLMParserBlock",
|
|
337
|
+
"llm",
|
|
338
|
+
"Deprecated: Use LLMResponseExtractorBlock instead",
|
|
339
|
+
deprecated=True,
|
|
340
|
+
replacement="LLMResponseExtractorBlock",
|
|
341
|
+
)
|
|
342
|
+
class LLMParserBlock(LLMResponseExtractorBlock):
|
|
343
|
+
"""Deprecated alias for LLMResponseExtractorBlock.
|
|
344
|
+
|
|
345
|
+
This class exists for backwards compatibility with existing code and YAML flows.
|
|
346
|
+
Use LLMResponseExtractorBlock instead.
|
|
347
|
+
"""
|
|
348
|
+
|
|
349
|
+
pass
|
|
@@ -222,6 +222,8 @@ class PromptRenderer:
|
|
|
222
222
|
"Formats prompts into structured chat messages or plain text using Jinja templates",
|
|
223
223
|
)
|
|
224
224
|
class PromptBuilderBlock(BaseBlock):
|
|
225
|
+
block_type: str = "llm_util"
|
|
226
|
+
|
|
225
227
|
"""Block for formatting prompts into structured chat messages or plain text.
|
|
226
228
|
|
|
227
229
|
This block takes input from dataset columns, applies Jinja templates from a YAML config
|
|
@@ -30,6 +30,8 @@ logger = setup_logger(__name__)
|
|
|
30
30
|
class TextParserBlock(BaseBlock):
|
|
31
31
|
_flow_requires_jsonl_tmp: bool = True
|
|
32
32
|
|
|
33
|
+
block_type: str = "parser"
|
|
34
|
+
|
|
33
35
|
"""Block for parsing and post-processing text content.
|
|
34
36
|
|
|
35
37
|
This block handles text parsing using start/end tags, custom regex patterns,
|
|
@@ -27,6 +27,8 @@ logger = setup_logger(__name__)
|
|
|
27
27
|
"Duplicates existing columns with new names according to a mapping specification",
|
|
28
28
|
)
|
|
29
29
|
class DuplicateColumnsBlock(BaseBlock):
|
|
30
|
+
block_type: str = "transform"
|
|
31
|
+
|
|
30
32
|
"""Block for duplicating existing columns with new names.
|
|
31
33
|
|
|
32
34
|
This block creates copies of existing columns with new names according to a mapping specification.
|
|
@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
|
|
|
28
28
|
"Maps values from source columns to output columns based on choice columns using shared mapping",
|
|
29
29
|
)
|
|
30
30
|
class IndexBasedMapperBlock(BaseBlock):
|
|
31
|
+
block_type: str = "transform"
|
|
32
|
+
|
|
31
33
|
"""Block for mapping values from source columns to output columns based on choice columns.
|
|
32
34
|
|
|
33
35
|
This block uses a shared mapping dictionary to select values from source columns and
|
|
@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
|
|
|
28
28
|
"Combines multiple columns into a single column containing a structured JSON object",
|
|
29
29
|
)
|
|
30
30
|
class JSONStructureBlock(BaseBlock):
|
|
31
|
+
block_type: str = "transform"
|
|
32
|
+
|
|
31
33
|
"""Block for combining multiple columns into a structured JSON object.
|
|
32
34
|
|
|
33
35
|
This block takes values from multiple input columns and combines them into a single
|
|
@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
|
|
|
28
28
|
"Transforms wide dataset format into long format by melting columns into rows",
|
|
29
29
|
)
|
|
30
30
|
class MeltColumnsBlock(BaseBlock):
|
|
31
|
+
block_type: str = "transform"
|
|
32
|
+
|
|
31
33
|
"""Block for flattening multiple columns into a long format.
|
|
32
34
|
|
|
33
35
|
This block transforms a wide dataset format into a long format by melting
|
|
@@ -27,6 +27,8 @@ logger = setup_logger(__name__)
|
|
|
27
27
|
"Renames columns in a dataset according to a mapping specification",
|
|
28
28
|
)
|
|
29
29
|
class RenameColumnsBlock(BaseBlock):
|
|
30
|
+
block_type: str = "transform"
|
|
31
|
+
|
|
30
32
|
"""Block for renaming columns in a dataset.
|
|
31
33
|
|
|
32
34
|
This block renames columns in a dataset according to a mapping specification.
|
|
@@ -27,6 +27,8 @@ logger = setup_logger(__name__)
|
|
|
27
27
|
"Combines multiple columns into a single column using a specified separator",
|
|
28
28
|
)
|
|
29
29
|
class TextConcatBlock(BaseBlock):
|
|
30
|
+
block_type: str = "transform"
|
|
31
|
+
|
|
30
32
|
"""Block for combining multiple columns into a single column.
|
|
31
33
|
|
|
32
34
|
This block concatenates values from multiple columns into a single output column,
|
|
@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
|
|
|
28
28
|
"Replaces all values in a column with a single summary statistic (e.g., mode, mean, median)",
|
|
29
29
|
)
|
|
30
30
|
class UniformColumnValueSetter(BaseBlock):
|
|
31
|
+
block_type: str = "transform"
|
|
32
|
+
|
|
31
33
|
"""Block that replaces all values in a column with a single aggregate value.
|
|
32
34
|
|
|
33
35
|
Supported strategies include: mode, min, max, mean, median.
|