sdg-hub 0.7.1__tar.gz → 0.7.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/actionlint.dockerfile +1 -1
  2. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/docs.yml +1 -1
  3. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/integration-test.yml +2 -2
  4. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/pypi.yaml +3 -3
  5. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/PKG-INFO +2 -2
  6. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/llm-blocks.md +2 -2
  7. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/flows/overview.md +3 -3
  8. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +26 -17
  9. sdg_hub-0.7.3/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/imgs/quality_benchmark_accuracy.png +0 -0
  10. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/knowledge_utils.py +12 -6
  11. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/text_analysis/structured_insights_demo.ipynb +3 -3
  12. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/pyproject.toml +1 -1
  13. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/_version.py +3 -3
  14. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/__init__.py +9 -2
  15. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/base.py +4 -1
  16. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +2 -0
  17. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/__init__.py +3 -2
  18. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +11 -5
  19. sdg_hub-0.7.1/src/sdg_hub/core/blocks/llm/llm_parser_block.py → sdg_hub-0.7.3/src/sdg_hub/core/blocks/llm/llm_response_extractor_block.py +32 -9
  20. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +2 -0
  21. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/text_parser_block.py +2 -0
  22. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +2 -0
  23. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +2 -0
  24. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/json_structure_block.py +2 -0
  25. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/melt_columns.py +2 -0
  26. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/rename_columns.py +2 -0
  27. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/text_concat.py +2 -0
  28. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +2 -0
  29. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/base.py +13 -32
  30. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/flow_metrics.py +3 -3
  31. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/flow.yaml +6 -6
  32. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +4 -4
  33. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +3 -3
  34. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +4 -4
  35. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +2 -2
  36. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +7 -7
  37. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +7 -7
  38. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +4 -4
  39. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/PKG-INFO +2 -2
  40. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/SOURCES.txt +3 -2
  41. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/requires.txt +1 -1
  42. sdg_hub-0.7.1/tests/blocks/llm/test_llm_parser_block.py → sdg_hub-0.7.3/tests/blocks/llm/test_llm_response_extractor_block.py +55 -52
  43. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/llm/test_promptbuilderblock.py +1 -1
  44. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/test_base_block.py +4 -3
  45. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_base.py +78 -4
  46. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/utils/test_flow_metrics.py +11 -11
  47. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/actionlint.yaml +0 -0
  48. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/actions/free-disk-space/action.yml +0 -0
  49. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/dependabot.yml +0 -0
  50. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/mergify.yml +0 -0
  51. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/actionlint.yml +0 -0
  52. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/lint.yml +0 -0
  53. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/matchers/actionlint.json +0 -0
  54. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/matchers/pylint.json +0 -0
  55. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/packer.yml +0 -0
  56. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.github/workflows/test.yml +0 -0
  57. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.gitignore +0 -0
  58. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.isort.cfg +0 -0
  59. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.markdownlint-cli2.yaml +0 -0
  60. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.pre-commit-config.yaml +0 -0
  61. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/.pylintrc +0 -0
  62. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/CLAUDE.md +0 -0
  63. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/CONTRIBUTING.md +0 -0
  64. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/LICENSE +0 -0
  65. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/Makefile +0 -0
  66. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/README.md +0 -0
  67. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/.nojekyll +0 -0
  68. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/README.md +0 -0
  69. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/_coverpage.md +0 -0
  70. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/_navbar.md +0 -0
  71. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/_sidebar.md +0 -0
  72. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/api-reference.md +0 -0
  73. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/assets/logo.png +0 -0
  74. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/assets/sdg-hub-cover.png +0 -0
  75. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/custom-blocks.md +0 -0
  76. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/filtering-blocks.md +0 -0
  77. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/overview.md +0 -0
  78. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/blocks/transform-blocks.md +0 -0
  79. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/concepts.md +0 -0
  80. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/development.md +0 -0
  81. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/flows/available-flows.md +0 -0
  82. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/flows/custom-flows.md +0 -0
  83. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/flows/discovery.md +0 -0
  84. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/index.html +0 -0
  85. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/installation.md +0 -0
  86. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/docs/quick-start.md +0 -0
  87. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +0 -0
  88. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +0 -0
  89. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +0 -0
  90. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +0 -0
  91. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
  92. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +0 -0
  93. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
  94. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/README.md +0 -0
  95. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
  96. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
  97. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  98. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
  99. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  100. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  101. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  102. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  103. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +0 -0
  104. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
  105. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/knowledge_generation_ja.ipynb +0 -0
  106. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
  107. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/rag_evaluation/ibm-annual-report-2024.pdf +0 -0
  108. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/rag_evaluation/rag_evaluation_dataset_generation.ipynb +0 -0
  109. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/text_analysis/README.md +0 -0
  110. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
  111. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/scripts/packer/centos.pkr.hcl +0 -0
  112. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/scripts/packer/setup-centos.sh +0 -0
  113. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/scripts/ruff.sh +0 -0
  114. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/scripts/snyk_notebook_scan.sh +0 -0
  115. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/setup.cfg +0 -0
  116. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/__init__.py +0 -0
  117. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/__init__.py +0 -0
  118. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
  119. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
  120. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/registry.py +0 -0
  121. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
  122. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/__init__.py +0 -0
  123. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/checkpointer.py +0 -0
  124. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/metadata.py +0 -0
  125. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/registry.py +0 -0
  126. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/flow/validation.py +0 -0
  127. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/__init__.py +0 -0
  128. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/datautils.py +0 -0
  129. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/error_handling.py +0 -0
  130. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
  131. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
  132. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/logger_config.py +0 -0
  133. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/path_resolution.py +0 -0
  134. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/time_estimator.py +0 -0
  135. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
  136. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/__init__.py +0 -0
  137. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/answer_generation.yaml +0 -0
  138. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/conceptual_qa_generation.yaml +0 -0
  139. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/context_extraction.yaml +0 -0
  140. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/groundedness_critic.yaml +0 -0
  141. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/question_evolution.yaml +0 -0
  142. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/evaluation/rag/topic_generation.yaml +0 -0
  143. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
  144. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
  145. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
  146. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
  147. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  148. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
  149. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
  150. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
  151. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
  152. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  153. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
  154. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
  155. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
  156. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
  157. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
  158. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
  159. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
  160. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
  161. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
  162. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
  163. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
  164. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
  165. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
  166. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
  167. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
  168. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
  169. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
  170. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
  171. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
  172. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
  173. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
  174. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
  175. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub/py.typed +0 -0
  176. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  177. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/src/sdg_hub.egg-info/top_level.txt +0 -0
  178. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/__init__.py +0 -0
  179. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
  180. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/llm/test_llm_chat_block.py +0 -0
  181. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/llm/test_textparserblock.py +0 -0
  182. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/test_registry.py +0 -0
  183. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_config.yaml +0 -0
  184. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
  185. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
  186. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
  187. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
  188. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
  189. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_index_based_mapper.py +0 -0
  190. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_json_structure_block.py +0 -0
  191. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_melt_columns.py +0 -0
  192. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_rename_columns.py +0 -0
  193. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_text_concat.py +0 -0
  194. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/blocks/transform/test_uniform_col_val_setter.py +0 -0
  195. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/__init__.py +0 -0
  196. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/conftest.py +0 -0
  197. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_checkpointer.py +0 -0
  198. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_dataset_requirements.py +0 -0
  199. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_integration.py +0 -0
  200. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_metadata.py +0 -0
  201. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_registry.py +0 -0
  202. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_time_estimation.py +0 -0
  203. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/flow/test_validation.py +0 -0
  204. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/README.md +0 -0
  205. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/__init__.py +0 -0
  206. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  207. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +0 -0
  208. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +0 -0
  209. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +0 -0
  210. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +0 -0
  211. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/utils/test_datautils.py +0 -0
  212. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/utils/test_error_handling.py +0 -0
  213. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tests/utils/test_path_resolution.py +0 -0
  214. {sdg_hub-0.7.1 → sdg_hub-0.7.3}/tox.ini +0 -0
@@ -1,3 +1,3 @@
1
1
  # Since dependabot cannot update workflows using docker,
2
2
  # we use this indirection since dependabot can update this file.
3
- FROM rhysd/actionlint:1.7.9@sha256:a0383f60d92601e2694e24b24d37df7b6a40bed7cedbc447611c50009bf02d94
3
+ FROM rhysd/actionlint:1.7.10@sha256:ef8299f97635c4c30e2298f48f30763ab782a4ad2c95b744649439a039421e36
@@ -39,6 +39,6 @@ jobs:
39
39
  - name: "Checkout"
40
40
  uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
41
41
  - name: "Check Markdown documents"
42
- uses: DavidAnson/markdownlint-cli2-action@30a0e04f1870d58f8d717450cc6134995f993c63 # v21.0.0
42
+ uses: DavidAnson/markdownlint-cli2-action@07035fd053f7be764496c0f8d8f9f41f98305101 # v22.0.0
43
43
  with:
44
44
  globs: '**/*.md'
@@ -112,7 +112,7 @@ jobs:
112
112
 
113
113
 
114
114
  - name: Cache huggingface datasets
115
- uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
115
+ uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
116
116
  with:
117
117
  path: ~/.cache/huggingface
118
118
  # Invalidate cache when any example notebook changes (may affect dataset downloads)
@@ -140,7 +140,7 @@ jobs:
140
140
  flags: integration
141
141
 
142
142
  - name: Upload integration test artifacts
143
- uses: actions/upload-artifact@v5
143
+ uses: actions/upload-artifact@v6
144
144
  if: always()
145
145
  with:
146
146
  name: integration-test-results-${{ matrix.python }}-${{ matrix.platform }}
@@ -72,7 +72,7 @@ jobs:
72
72
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
73
73
 
74
74
  - name: "Download build artifacts"
75
- uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
75
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
76
76
  with:
77
77
  name: Packages
78
78
  path: dist
@@ -104,13 +104,13 @@ jobs:
104
104
  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
105
105
 
106
106
  - name: "Download build artifacts"
107
- uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
107
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
108
108
  with:
109
109
  name: Packages
110
110
  path: dist
111
111
 
112
112
  - name: "Sigstore sign package"
113
- uses: sigstore/gh-action-sigstore-python@f832326173235dcb00dd5d92cd3f353de3188e6c # v3.1.0
113
+ uses: sigstore/gh-action-sigstore-python@a5caf349bc536fbef3668a10ed7f5cd309a4b53d # v3.2.0
114
114
  with:
115
115
  inputs: |
116
116
  ./dist/*.tar.gz
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.7.1
3
+ Version: 0.7.3
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -26,7 +26,7 @@ Requires-Dist: click<9.0.0,>=8.1.7
26
26
  Requires-Dist: datasets>=4.0.0
27
27
  Requires-Dist: httpx<1.0.0,>=0.25.0
28
28
  Requires-Dist: jinja2
29
- Requires-Dist: litellm<1.75.0,>=1.73.0
29
+ Requires-Dist: litellm<2.0.0,>=1.73.0
30
30
  Requires-Dist: rich
31
31
  Requires-Dist: pandas
32
32
  Requires-Dist: pydantic<3.0.0,>=2.0.0
@@ -603,7 +603,7 @@ print(result["judgment"]) # ['YES']
603
603
  TextParserBlock is commonly used after LLMChatBlock to structure responses:
604
604
 
605
605
  ```python
606
- from sdg_hub.core.blocks import LLMChatBlock, LLMParserBlock, TextParserBlock
606
+ from sdg_hub.core.blocks import LLMChatBlock, LLMResponseExtractorBlock, TextParserBlock
607
607
 
608
608
  # Step 1: Generate LLM response
609
609
  chat_block = LLMChatBlock(
@@ -615,7 +615,7 @@ chat_block = LLMChatBlock(
615
615
 
616
616
  # Step 2: Extract content from response object
617
617
  # Use field_prefix="" to get cleaner column names
618
- llm_parser = LLMParserBlock(
618
+ llm_parser = LLMResponseExtractorBlock(
619
619
  block_name="extract_eval",
620
620
  input_cols=["eval_response"],
621
621
  extract_content=True,
@@ -316,7 +316,7 @@ blocks:
316
316
  output_cols: ["eval_response"]
317
317
  async_mode: true
318
318
 
319
- - block_type: "LLMParserBlock"
319
+ - block_type: "LLMResponseExtractorBlock"
320
320
  block_config:
321
321
  block_name: "extract_eval_content"
322
322
  input_cols: ["eval_response"]
@@ -537,7 +537,7 @@ result = flow.generate(
537
537
  | | `top_p` | Nucleus sampling threshold | `0.0` - `1.0` |
538
538
  | | `frequency_penalty` | Penalize token repetition | `-2.0` - `2.0` |
539
539
  | | `presence_penalty` | Penalize new topics | `-2.0` - `2.0` |
540
- | **LLMParserBlock** | `extract_content` | Extract main content field | `True`, `False` |
540
+ | **LLMResponseExtractorBlock** | `extract_content` | Extract main content field | `True`, `False` |
541
541
  | | `extract_reasoning_content` | Extract reasoning/thinking | `True`, `False` |
542
542
  | | `extract_tool_calls` | Extract tool call data | `True`, `False` |
543
543
  | | `field_prefix` | Prefix for output fields | `"llm_"`, `"parsed_"` |
@@ -752,7 +752,7 @@ result = flow.generate(dataset)
752
752
  │ │ generate_question │ LLMChatBlock │ 45.30s │ 100 → 100 │ +1 │ ✓││
753
753
  │ │ generate_answer │ LLMChatBlock │ 78.45s │ 100 → 100 │ +1 │ ✓││
754
754
  │ │ eval_faithfulness... │ LLMChatBlock │ 52.20s │ 100 → 100 │ +1 │ ✓││
755
- │ │ extract_eval_con... │ LLMParserBlock │ 0.15s │ 100 → 100 │ +2 │ ✓││
755
+ │ │ extract_eval_con... │ LLMResponseExtractorBlock │ 0.15s │ 100 → 100 │ +2 │ ✓││
756
756
  │ │ parse_evaluation │ TextParserBlock │ 0.22s │ 100 → 100 │ +2 │ ✓││
757
757
  │ │ filter_faithful │ ColumnValueF... │ 0.08s │ 100 → 87 │ — │ ✓││
758
758
  │ ├──────────────────────┼─────────────────┼──────────┼──────────────┼─────────┼──┤│
@@ -48,29 +48,38 @@ Only claims passing this check are retained. This process filters out **hallucin
48
48
 
49
49
  ---
50
50
 
51
- ## Data Generation Statistics
51
+ ## Data Generation Statistics and Results
52
+
53
+ **Teacher model for generation:** `openai/gpt-oss-120b`
54
+ **Student model trained:** `meta-llama/Llama-3.1-8B-Instruct`
55
+ **Training method:** Supervised Fine-Tuning (SFT)
56
+
57
+ ---
52
58
 
53
59
  ### Summary Augmentation
54
60
 
55
- Each “cut” represents the total number of summaries generated per document across all three augmentation types.
61
+ For each document, we generate three augmentation types—detailed summaries, extractive summaries, and atomic facts. Each “cut” in the table below represents the total number of summary augmentations per document (i.e., how many times each augmentation process is run).
56
62
 
57
- | Cut (NUMBER\_OF\_SUMMARIES = 3) | Token Count |
58
- | ------------------------------- | ----------- |
59
- | 1 | 2,193,502 |
60
- | 2 | 4,383,655 |
61
- | 5 | 10,870,396 |
62
- | 10 | 21,815,170 |
63
- | 20 | 43,601,976 |
64
- | 30 | 65,395,710 |
65
- | 40 | 87,118,308 |
66
- | 50 | 108,779,213 |
63
+ | Cut (NUMBER\_OF\_SUMMARIES = 3) | Token Count |
64
+ | ------------------------------- | ------------- |
65
+ | Input Corpus | 1,517,465 |
66
+ | 10 | 87,248,889 |
67
+ | 20 | 158,615,276 |
68
+ | 30 | 230,306,195 |
69
+ | 40 | 301,805,906 |
70
+ | 50 | 373,183,414 |
67
71
 
68
72
  ---
69
73
 
70
- ### Finance Bench Example
74
+ ### Benchmark Results
71
75
 
72
- For Finance Bench (NUMBER\_OF\_SUMMARIES = 1):
76
+ - **Evaluation benchmark:** [QuALITY benchmark](https://nyu-mll.github.io/quality/)
77
+ - **Evaluation script & metric:** [Synthetic_Continued_Pretraining](https://github.com/ZitongYang/Synthetic_Continued_Pretraining/blob/main/evaluation.py), Exact Match (EM)
78
+ - **Student model:** meta-llama/Llama-3.1-8B-Instruct (after SFT on generated/augmented summaries)
79
+ - **Performance metric:** Model accuracy
73
80
 
74
- | Cut | Token Count |
75
- | --- | ----------- |
76
- | 50 | 213,333,192 |
81
+ ![Quality Benchmark Accuracy](imgs/quality_benchmark_accuracy.png)
82
+
83
+ *Figure: Model accuracy across the QuALITY benchmark datasets, comparing SFT training on enhanced document summaries with the original model performance.*
84
+
85
+ ---
@@ -602,13 +602,14 @@ def _num_chars_from_tokens(num_tokens) -> int:
602
602
  return int(num_tokens * 4) # 1 token ~ 4 English characters
603
603
 
604
604
 
605
- def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
605
+ def chunk_document(documents: List, server_ctx_size, chunk_word_count, **kwargs) -> List[str]:
606
606
  """
607
607
  Iterates over the documents and splits them into chunks based on the word count provided by the user.
608
608
  Args:
609
609
  documents (list): List of documents retrieved from git (can also consist of a single document).
610
610
  server_ctx_size (int): Context window size of server.
611
611
  chunk_word_count (int): Maximum number of words to chunk a document.
612
+ chunk_overlap (int): Overlap in characters between chunks.
612
613
  Returns:
613
614
  List[str]: List of chunked documents.
614
615
  """
@@ -634,7 +635,7 @@ def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[s
634
635
  # Placeholder for params
635
636
  content = []
636
637
  chunk_size = _num_chars_from_tokens(no_tokens_per_doc)
637
- chunk_overlap = _DEFAULT_CHUNK_OVERLAP
638
+ chunk_overlap = int(kwargs.pop("chunk_overlap", str(_DEFAULT_CHUNK_OVERLAP)))
638
639
 
639
640
  # Using Markdown as default; document-specific chunking will be implemented in a separate PR.
640
641
  text_splitter = RecursiveCharacterTextSplitter.from_language(
@@ -729,16 +730,21 @@ class DocProcessor:
729
730
  }
730
731
  )
731
732
 
732
- def _add_icls(self, chunked_document: Dataset) -> Dataset:
733
+ def _add_icls(self, chunked_document: Dataset, **kwargs) -> Dataset:
733
734
  """
734
735
  Add the ICLS label to the dataset.
735
736
  Args:
736
737
  chunked_document (Dataset): Dataset object.
738
+ server_ctx_size (int): Context window size of server.
739
+ chunk_word_count (int): Maximum number of words to chunk a document.
740
+ chunk_overlap (int): Overlap in characters between chunks.
737
741
 
738
742
  Returns
739
743
  -------
740
744
  Dataset: Dataset object with ICLS label.
741
745
  """
746
+ server_ctx_size = int(kwargs.pop("server_ctx_size", "4096"))
747
+ chunk_word_count = int(kwargs.pop("chunk_word_count", "1024"))
742
748
  icl = self.user_config["seed_examples"]
743
749
  chunked_document_all_icl = []
744
750
  for icl_ in icl:
@@ -762,7 +768,7 @@ class DocProcessor:
762
768
  chunked_document_all_icl = chunked_document_all_icl.map(
763
769
  lambda x: {
764
770
  "chunks": chunk_document(
765
- [x["document"]], server_ctx_size=4096, chunk_word_count=1024
771
+ [x["document"]], server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, **kwargs
766
772
  )
767
773
  if get_token_count(x["document"], self.tokenizer) > 1024
768
774
  else [x["document"]]
@@ -797,7 +803,7 @@ class DocProcessor:
797
803
  df = safe_concatenate_datasets([ds.to_pandas() for ds in datasets])
798
804
  return Dataset.from_pandas(df) if df is not None else None
799
805
 
800
- def get_processed_markdown_dataset(self, list_md_files: list[Path]) -> Dataset:
806
+ def get_processed_markdown_dataset(self, list_md_files: list[Path], **kwargs) -> Dataset:
801
807
  chunks_mds = []
802
808
  for md_file in list_md_files:
803
809
  with open(md_file, "r", encoding="utf-8") as f:
@@ -811,5 +817,5 @@ class DocProcessor:
811
817
  }
812
818
  )
813
819
  chunk_ds = Dataset.from_list(chunks_mds)
814
- chunk_ds_with_icls = self._add_icls(chunk_ds)
820
+ chunk_ds_with_icls = self._add_icls(chunk_ds, **kwargs)
815
821
  return chunk_ds_with_icls
@@ -332,7 +332,7 @@
332
332
  " LLMChatBlock,\n",
333
333
  " PromptBuilderBlock,\n",
334
334
  " TextParserBlock,\n",
335
- " LLMParserBlock,\n",
335
+ " LLMResponseExtractorBlock,\n",
336
336
  ")\n",
337
337
  "from sdg_hub.core.blocks.transform import JSONStructureBlock\n",
338
338
  "\n",
@@ -355,7 +355,7 @@
355
355
  " temperature=0.1, # Low temperature for more consistent extraction\n",
356
356
  ")\n",
357
357
  "\n",
358
- "ticker_llm_parser_block = LLMParserBlock(\n",
358
+ "ticker_llm_response_extractor_block = LLMResponseExtractorBlock(\n",
359
359
  " block_name=\"extract_stock_tickers\",\n",
360
360
  " input_cols=[\"raw_stock_tickers\"],\n",
361
361
  " extract_content=True,\n",
@@ -406,7 +406,7 @@
406
406
  "ticker_blocks = [\n",
407
407
  " ticker_prompt_block,\n",
408
408
  " ticker_llm_block,\n",
409
- " ticker_llm_parser_block,\n",
409
+ " ticker_llm_response_extractor_block,\n",
410
410
  " ticker_parser_block,\n",
411
411
  " enhanced_json_block,\n",
412
412
  "]\n",
@@ -33,7 +33,7 @@ dependencies = [
33
33
  "datasets>=4.0.0",
34
34
  "httpx>=0.25.0,<1.0.0",
35
35
  "jinja2",
36
- "litellm>=1.73.0,<1.75.0",
36
+ "litellm>=1.73.0,<2.0.0", # cap raised: tests run without 'backoff'-related errors; cap back to <1.75.0 if errors surface
37
37
  "rich",
38
38
  "pandas",
39
39
  "pydantic>=2.0.0,<3.0.0", # cap before v3; adjust the lower bound to the minimum v2.x you’ve tested
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.7.1'
32
- __version_tuple__ = version_tuple = (0, 7, 1)
31
+ __version__ = version = '0.7.3'
32
+ __version_tuple__ = version_tuple = (0, 7, 3)
33
33
 
34
- __commit_id__ = commit_id = 'g884bce940'
34
+ __commit_id__ = commit_id = 'g97824a47f'
@@ -6,7 +6,13 @@ This package provides various block implementations for data generation, process
6
6
  # Local
7
7
  from .base import BaseBlock
8
8
  from .filtering import ColumnValueFilterBlock
9
- from .llm import LLMChatBlock, LLMParserBlock, PromptBuilderBlock, TextParserBlock
9
+ from .llm import (
10
+ LLMChatBlock,
11
+ LLMParserBlock,
12
+ LLMResponseExtractorBlock,
13
+ PromptBuilderBlock,
14
+ TextParserBlock,
15
+ )
10
16
  from .registry import BlockRegistry
11
17
  from .transform import (
12
18
  DuplicateColumnsBlock,
@@ -28,7 +34,8 @@ __all__ = [
28
34
  "TextConcatBlock",
29
35
  "UniformColumnValueSetter",
30
36
  "LLMChatBlock",
31
- "LLMParserBlock",
37
+ "LLMParserBlock", # Deprecated alias for LLMResponseExtractorBlock
38
+ "LLMResponseExtractorBlock",
32
39
  "TextParserBlock",
33
40
  "PromptBuilderBlock",
34
41
  ]
@@ -49,6 +49,9 @@ class BaseBlock(BaseModel, ABC):
49
49
  block_name: str = Field(
50
50
  ..., description="Unique identifier for this block instance"
51
51
  )
52
+ block_type: Optional[str] = Field(
53
+ None, description="Block type (e.g., 'llm', 'transform', 'parser', 'filtering')"
54
+ )
52
55
  input_cols: Union[str, list[str], dict[str, Any], None] = Field(
53
56
  None, description="Input columns: str, list, or dict"
54
57
  )
@@ -366,5 +369,5 @@ class BaseBlock(BaseModel, ABC):
366
369
  Dict[str, Any]
367
370
  """
368
371
  config = self.get_config()
369
- config["block_type"] = self.__class__.__name__
372
+ config["block_class"] = self.__class__.__name__
370
373
  return config
@@ -46,6 +46,8 @@ DTYPE_MAP = {
46
46
  "Filters datasets based on column values using various comparison operations",
47
47
  )
48
48
  class ColumnValueFilterBlock(BaseBlock):
49
+ block_type: str = "filtering"
50
+
49
51
  """A block for filtering datasets based on column values.
50
52
 
51
53
  This block allows filtering of datasets using various operations (e.g., equals, contains)
@@ -9,7 +9,7 @@ local models (vLLM, Ollama), and more.
9
9
  # Local
10
10
  from .error_handler import ErrorCategory, LLMErrorHandler
11
11
  from .llm_chat_block import LLMChatBlock
12
- from .llm_parser_block import LLMParserBlock
12
+ from .llm_response_extractor_block import LLMParserBlock, LLMResponseExtractorBlock
13
13
  from .prompt_builder_block import PromptBuilderBlock
14
14
  from .text_parser_block import TextParserBlock
15
15
 
@@ -17,7 +17,8 @@ __all__ = [
17
17
  "LLMErrorHandler",
18
18
  "ErrorCategory",
19
19
  "LLMChatBlock",
20
- "LLMParserBlock",
20
+ "LLMParserBlock", # Deprecated alias for LLMResponseExtractorBlock
21
+ "LLMResponseExtractorBlock",
21
22
  "PromptBuilderBlock",
22
23
  "TextParserBlock",
23
24
  ]
@@ -6,7 +6,8 @@ from typing import Any, Optional
6
6
  import asyncio
7
7
 
8
8
  from litellm import acompletion, completion
9
- from pydantic import ConfigDict, Field, field_validator
9
+ from pydantic import ConfigDict, Field, SecretStr, field_validator
10
+ from tqdm.asyncio import tqdm_asyncio
10
11
  import litellm
11
12
 
12
13
  # Third Party
@@ -31,6 +32,8 @@ logger = setup_logger(__name__)
31
32
  class LLMChatBlock(BaseBlock):
32
33
  model_config = ConfigDict(extra="allow")
33
34
 
35
+ block_type: str = "llm"
36
+
34
37
  """Unified LLM chat block supporting all providers via LiteLLM.
35
38
 
36
39
  This block provides a minimal wrapper around LiteLLM's completion API,
@@ -52,8 +55,9 @@ class LLMChatBlock(BaseBlock):
52
55
  model : Optional[str], optional
53
56
  Model identifier in LiteLLM format. Can be set later via flow.set_model_config().
54
57
  Examples: "openai/gpt-4", "anthropic/claude-3-sonnet-20240229"
55
- api_key : Optional[str], optional
58
+ api_key : Optional[SecretStr], optional
56
59
  API key for the provider. Falls back to environment variables.
60
+ Automatically redacted in logs and string representations.
57
61
  api_base : Optional[str], optional
58
62
  Base URL for the API. Required for local models.
59
63
  async_mode : bool, optional
@@ -97,7 +101,7 @@ class LLMChatBlock(BaseBlock):
97
101
  model: Optional[str] = Field(
98
102
  None, exclude=True, description="Model identifier in LiteLLM format"
99
103
  )
100
- api_key: Optional[str] = Field(
104
+ api_key: Optional[SecretStr] = Field(
101
105
  None, exclude=True, description="API key for the provider"
102
106
  )
103
107
  api_base: Optional[str] = Field(
@@ -301,7 +305,7 @@ class LLMChatBlock(BaseBlock):
301
305
  if self.model is not None:
302
306
  completion_kwargs["model"] = self.model
303
307
  if self.api_key is not None:
304
- completion_kwargs["api_key"] = self.api_key
308
+ completion_kwargs["api_key"] = self.api_key.get_secret_value()
305
309
  if self.api_base is not None:
306
310
  completion_kwargs["api_base"] = self.api_base
307
311
  if self.timeout is not None:
@@ -501,7 +505,9 @@ class LLMChatBlock(BaseBlock):
501
505
  for messages in messages_list
502
506
  ]
503
507
 
504
- responses = await asyncio.gather(*tasks)
508
+ responses = await tqdm_asyncio.gather(
509
+ *tasks, desc=self.block_name, unit="req"
510
+ )
505
511
  return responses
506
512
 
507
513
  except Exception as e:
@@ -1,7 +1,7 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
- """LLM parser block for extracting fields from LLM response objects.
2
+ """LLM response extractor block for extracting fields from LLM response objects.
3
3
 
4
- This module provides the LLMParserBlock for extracting specific fields
4
+ This module provides the LLMResponseExtractorBlock for extracting specific fields
5
5
  (content, reasoning_content, tool_calls) from chat completion response objects.
6
6
  """
7
7
 
@@ -22,13 +22,15 @@ logger = setup_logger(__name__)
22
22
 
23
23
 
24
24
  @BlockRegistry.register(
25
- "LLMParserBlock",
25
+ "LLMResponseExtractorBlock",
26
26
  "llm",
27
27
  "Extracts specified fields from LLM response objects",
28
28
  )
29
- class LLMParserBlock(BaseBlock):
29
+ class LLMResponseExtractorBlock(BaseBlock):
30
30
  _flow_requires_jsonl_tmp: bool = True
31
31
 
32
+ block_type: str = "llm_util"
33
+
32
34
  """Block for extracting fields from LLM response objects.
33
35
 
34
36
  This block extracts specified fields from chat completion response objects.
@@ -88,7 +90,7 @@ class LLMParserBlock(BaseBlock):
88
90
  ]
89
91
  ):
90
92
  raise ValueError(
91
- "LLMParserBlock requires at least one extraction field to be enabled: "
93
+ "LLMResponseExtractorBlock requires at least one extraction field to be enabled: "
92
94
  "extract_content, extract_reasoning_content, or extract_tool_calls"
93
95
  )
94
96
 
@@ -106,7 +108,7 @@ class LLMParserBlock(BaseBlock):
106
108
  return self
107
109
 
108
110
  def _validate_custom(self, dataset: pd.DataFrame) -> None:
109
- """Validate LLMParserBlock specific requirements.
111
+ """Validate LLMResponseExtractorBlock specific requirements.
110
112
 
111
113
  Parameters
112
114
  ----------
@@ -116,14 +118,16 @@ class LLMParserBlock(BaseBlock):
116
118
  Raises
117
119
  ------
118
120
  ValueError
119
- If LLMParserBlock requirements are not met.
121
+ If LLMResponseExtractorBlock requirements are not met.
120
122
  """
121
123
  # Require at least one input column; warn if more than one is given (only the first is used)
122
124
  if len(self.input_cols) == 0:
123
- raise ValueError("LLMParserBlock expects at least one input column")
125
+ raise ValueError(
126
+ "LLMResponseExtractorBlock expects at least one input column"
127
+ )
124
128
  if len(self.input_cols) > 1:
125
129
  logger.warning(
126
- f"LLMParserBlock expects exactly one input column, but got {len(self.input_cols)}. "
130
+ f"LLMResponseExtractorBlock expects exactly one input column, but got {len(self.input_cols)}. "
127
131
  f"Using the first column: {self.input_cols[0]}"
128
132
  )
129
133
 
@@ -324,3 +328,22 @@ class LLMParserBlock(BaseBlock):
324
328
  new_data.extend(self._generate(sample))
325
329
 
326
330
  return pd.DataFrame(new_data)
331
+
332
+
333
+ # Backwards compatibility alias (deprecated)
334
+ # Register deprecated alias in BlockRegistry so old YAML flows still work
335
+ @BlockRegistry.register(
336
+ "LLMParserBlock",
337
+ "llm",
338
+ "Deprecated: Use LLMResponseExtractorBlock instead",
339
+ deprecated=True,
340
+ replacement="LLMResponseExtractorBlock",
341
+ )
342
+ class LLMParserBlock(LLMResponseExtractorBlock):
343
+ """Deprecated alias for LLMResponseExtractorBlock.
344
+
345
+ This class exists for backwards compatibility with existing code and YAML flows.
346
+ Use LLMResponseExtractorBlock instead.
347
+ """
348
+
349
+ pass
@@ -222,6 +222,8 @@ class PromptRenderer:
222
222
  "Formats prompts into structured chat messages or plain text using Jinja templates",
223
223
  )
224
224
  class PromptBuilderBlock(BaseBlock):
225
+ block_type: str = "llm_util"
226
+
225
227
  """Block for formatting prompts into structured chat messages or plain text.
226
228
 
227
229
  This block takes input from dataset columns, applies Jinja templates from a YAML config
@@ -30,6 +30,8 @@ logger = setup_logger(__name__)
30
30
  class TextParserBlock(BaseBlock):
31
31
  _flow_requires_jsonl_tmp: bool = True
32
32
 
33
+ block_type: str = "parser"
34
+
33
35
  """Block for parsing and post-processing text content.
34
36
 
35
37
  This block handles text parsing using start/end tags, custom regex patterns,
@@ -27,6 +27,8 @@ logger = setup_logger(__name__)
27
27
  "Duplicates existing columns with new names according to a mapping specification",
28
28
  )
29
29
  class DuplicateColumnsBlock(BaseBlock):
30
+ block_type: str = "transform"
31
+
30
32
  """Block for duplicating existing columns with new names.
31
33
 
32
34
  This block creates copies of existing columns with new names according to a mapping specification.
@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
28
28
  "Maps values from source columns to output columns based on choice columns using shared mapping",
29
29
  )
30
30
  class IndexBasedMapperBlock(BaseBlock):
31
+ block_type: str = "transform"
32
+
31
33
  """Block for mapping values from source columns to output columns based on choice columns.
32
34
 
33
35
  This block uses a shared mapping dictionary to select values from source columns and
@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
28
28
  "Combines multiple columns into a single column containing a structured JSON object",
29
29
  )
30
30
  class JSONStructureBlock(BaseBlock):
31
+ block_type: str = "transform"
32
+
31
33
  """Block for combining multiple columns into a structured JSON object.
32
34
 
33
35
  This block takes values from multiple input columns and combines them into a single
@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
28
28
  "Transforms wide dataset format into long format by melting columns into rows",
29
29
  )
30
30
  class MeltColumnsBlock(BaseBlock):
31
+ block_type: str = "transform"
32
+
31
33
  """Block for flattening multiple columns into a long format.
32
34
 
33
35
  This block transforms a wide dataset format into a long format by melting
@@ -27,6 +27,8 @@ logger = setup_logger(__name__)
27
27
  "Renames columns in a dataset according to a mapping specification",
28
28
  )
29
29
  class RenameColumnsBlock(BaseBlock):
30
+ block_type: str = "transform"
31
+
30
32
  """Block for renaming columns in a dataset.
31
33
 
32
34
  This block renames columns in a dataset according to a mapping specification.
@@ -27,6 +27,8 @@ logger = setup_logger(__name__)
27
27
  "Combines multiple columns into a single column using a specified separator",
28
28
  )
29
29
  class TextConcatBlock(BaseBlock):
30
+ block_type: str = "transform"
31
+
30
32
  """Block for combining multiple columns into a single column.
31
33
 
32
34
  This block concatenates values from multiple columns into a single output column,
@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
28
28
  "Replaces all values in a column with a single summary statistic (e.g., mode, mean, median)",
29
29
  )
30
30
  class UniformColumnValueSetter(BaseBlock):
31
+ block_type: str = "transform"
32
+
31
33
  """Block that replaces all values in a column with a single aggregate value.
32
34
 
33
35
  Supported strategies include: mode, min, max, mean, median.