sdg-hub 0.4.0__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223) hide show
  1. sdg_hub-0.4.2/.github/workflows/integration-test.yml +143 -0
  2. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/test.yml +6 -3
  3. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.gitignore +5 -0
  4. {sdg_hub-0.4.0/src/sdg_hub.egg-info → sdg_hub-0.4.2}/PKG-INFO +2 -1
  5. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/blocks/llm-blocks.md +236 -2
  6. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/concepts.md +14 -1
  7. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/flows/discovery.md +38 -1
  8. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/flows/overview.md +215 -12
  9. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/quick-start.md +31 -4
  10. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/annotation/news_classification_flow.yaml +0 -5
  11. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/.env.example +4 -1
  12. sdg_hub-0.4.2/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/document_pre_processing.ipynb +214 -0
  13. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_generation.ipynb +72 -53
  14. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing.ipynb +57 -237
  15. sdg_hub-0.4.2/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/raft_builder.py +259 -0
  16. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_pre_processing.ipynb +2 -2
  17. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/pyproject.toml +8 -2
  18. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/__init__.py +0 -2
  19. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/_version.py +3 -3
  20. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/__init__.py +1 -2
  21. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/__init__.py +3 -4
  22. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/base.py +143 -71
  23. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/metadata.py +1 -68
  24. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/registry.py +0 -1
  25. sdg_hub-0.4.2/src/sdg_hub/core/utils/__init__.py +21 -0
  26. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/flow_metrics.py +116 -0
  27. sdg_hub-0.4.2/src/sdg_hub/core/utils/time_estimator.py +344 -0
  28. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +0 -1
  29. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +0 -1
  30. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +0 -1
  31. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +0 -1
  32. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +1 -2
  33. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +11 -10
  34. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/flow.yaml +0 -1
  35. {sdg_hub-0.4.0 → sdg_hub-0.4.2/src/sdg_hub.egg-info}/PKG-INFO +2 -1
  36. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub.egg-info/SOURCES.txt +13 -0
  37. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub.egg-info/requires.txt +1 -0
  38. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/test_base.py +76 -48
  39. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/test_integration.py +0 -32
  40. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/test_metadata.py +1 -73
  41. sdg_hub-0.4.2/tests/flow/test_time_estimation.py +546 -0
  42. sdg_hub-0.4.2/tests/integration/README.md +95 -0
  43. sdg_hub-0.4.2/tests/integration/__init__.py +3 -0
  44. sdg_hub-0.4.2/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +62 -0
  45. sdg_hub-0.4.2/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/__init__.py +1 -0
  46. sdg_hub-0.4.2/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/conftest.py +64 -0
  47. sdg_hub-0.4.2/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_data/test_seed_data.jsonl +1 -0
  48. sdg_hub-0.4.2/tests/integration/knowledge_tuning/enhanced_summary_knowledge_tuning/test_functional.py +114 -0
  49. sdg_hub-0.4.2/tests/utils/test_flow_metrics.py +477 -0
  50. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tox.ini +31 -3
  51. sdg_hub-0.4.0/src/sdg_hub/core/utils/__init__.py +0 -13
  52. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/actionlint.yaml +0 -0
  53. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/actions/free-disk-space/action.yml +0 -0
  54. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/dependabot.yml +0 -0
  55. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/mergify.yml +0 -0
  56. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/actionlint.dockerfile +0 -0
  57. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/actionlint.yml +0 -0
  58. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/docs.yml +0 -0
  59. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/e2e.yml +0 -0
  60. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/lint.yml +0 -0
  61. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/matchers/actionlint.json +0 -0
  62. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/matchers/pylint.json +0 -0
  63. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/packer.yml +0 -0
  64. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.github/workflows/pypi.yaml +0 -0
  65. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.isort.cfg +0 -0
  66. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.markdownlint-cli2.yaml +0 -0
  67. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.pre-commit-config.yaml +0 -0
  68. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/.pylintrc +0 -0
  69. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/CLAUDE.md +0 -0
  70. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/CONTRIBUTING.md +0 -0
  71. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/LICENSE +0 -0
  72. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/Makefile +0 -0
  73. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/README.md +0 -0
  74. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/.nojekyll +0 -0
  75. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/README.md +0 -0
  76. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/_coverpage.md +0 -0
  77. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/_navbar.md +0 -0
  78. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/_sidebar.md +0 -0
  79. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/api-reference.md +0 -0
  80. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/blocks/custom-blocks.md +0 -0
  81. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/blocks/filtering-blocks.md +0 -0
  82. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/blocks/overview.md +0 -0
  83. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/blocks/transform-blocks.md +0 -0
  84. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/development.md +0 -0
  85. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/index.html +0 -0
  86. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/docs/installation.md +0 -0
  87. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/annotation/annotation_classification.ipynb +0 -0
  88. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/annotation/news_classification_assessment_prompt.yaml +0 -0
  89. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/annotation/news_classification_prompt.yaml +0 -0
  90. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/annotation/revise_news_classification_prompt.yaml +0 -0
  91. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/README.md +0 -0
  92. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/enhanced_summary_knowledge_tuning/knowledge_mixing_utils.py +0 -0
  93. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/.gitignore +0 -0
  94. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/README.md +0 -0
  95. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/assets/imgs/instructlab-banner.png +0 -0
  96. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/docling_v2_config.yaml +0 -0
  97. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/docparser.py +0 -0
  98. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/docparser_v2.py +0 -0
  99. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.json +0 -0
  100. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.md +0 -0
  101. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/ibm-annual-report-2024.pdf +0 -0
  102. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/document_collection/ibm-annual-report/qna.yaml +0 -0
  103. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/knowledge_generation_and_mixing.ipynb +0 -0
  104. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/instructlab/logger_config.py +0 -0
  105. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/knowledge_tuning/knowledge_utils.py +0 -0
  106. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/text_analysis/README.md +0 -0
  107. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/text_analysis/extract_stock_tickers.yaml +0 -0
  108. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/examples/text_analysis/structured_insights_demo.ipynb +0 -0
  109. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/scripts/ruff.sh +0 -0
  110. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/setup.cfg +0 -0
  111. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/__init__.py +0 -0
  112. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/base.py +0 -0
  113. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -0
  114. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -0
  115. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -0
  116. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -0
  117. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -0
  118. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -0
  119. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -0
  120. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -0
  121. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -0
  122. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -0
  123. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/filtering/__init__.py +0 -0
  124. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/filtering/column_value_filter.py +0 -0
  125. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/__init__.py +0 -0
  126. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/error_handler.py +0 -0
  127. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/llm_chat_block.py +0 -0
  128. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -0
  129. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/llm_parser_block.py +0 -0
  130. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/prompt_builder_block.py +0 -0
  131. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/llm/text_parser_block.py +0 -0
  132. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/registry.py +0 -0
  133. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/__init__.py +0 -0
  134. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/duplicate_columns.py +0 -0
  135. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/index_based_mapper.py +0 -0
  136. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/json_structure_block.py +0 -0
  137. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/melt_columns.py +0 -0
  138. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/rename_columns.py +0 -0
  139. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/text_concat.py +0 -0
  140. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/blocks/transform/uniform_col_val_setter.py +0 -0
  141. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/checkpointer.py +0 -0
  142. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/migration.py +0 -0
  143. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/flow/validation.py +0 -0
  144. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/datautils.py +0 -0
  145. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/error_handling.py +0 -0
  146. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/flow_id_words.yaml +0 -0
  147. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/flow_identifier.py +0 -0
  148. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/logger_config.py +0 -0
  149. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/path_resolution.py +0 -0
  150. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/core/utils/yaml_utils.py +0 -0
  151. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
  152. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
  153. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +0 -0
  154. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
  155. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  156. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +0 -0
  157. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +0 -0
  158. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +0 -0
  159. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +0 -0
  160. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  161. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +0 -0
  162. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md +0 -0
  163. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py +0 -0
  164. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +0 -0
  165. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +0 -0
  166. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +0 -0
  167. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +0 -0
  168. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +0 -0
  169. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +0 -0
  170. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +0 -0
  171. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
  172. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
  173. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +0 -0
  174. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +0 -0
  175. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +0 -0
  176. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +0 -0
  177. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/__init__.py +0 -0
  178. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/__init__.py +0 -0
  179. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +0 -0
  180. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +0 -0
  181. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +0 -0
  182. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +0 -0
  183. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub/py.typed +0 -0
  184. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub.egg-info/dependency_links.txt +0 -0
  185. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/src/sdg_hub.egg-info/top_level.txt +0 -0
  186. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/__init__.py +0 -0
  187. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/deprecated/test_llmblock.py +0 -0
  188. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/filtering/test_columnvaluefilter.py +0 -0
  189. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/llm/test_llm_chat_block.py +0 -0
  190. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/llm/test_llm_chat_with_parsing_retry_block.py +0 -0
  191. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/llm/test_llm_parser_block.py +0 -0
  192. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/llm/test_promptbuilderblock.py +0 -0
  193. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/llm/test_textparserblock.py +0 -0
  194. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/test_base_block.py +0 -0
  195. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/test_registry.py +0 -0
  196. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/testdata/test_config.yaml +0 -0
  197. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_format_config.yaml +0 -0
  198. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_format_no_system.yaml +0 -0
  199. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_format_strict.yaml +0 -0
  200. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_invalid_final_role.yaml +0 -0
  201. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/testdata/test_prompt_no_user_messages.yaml +0 -0
  202. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/transform/test_index_based_mapper.py +0 -0
  203. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/transform/test_json_structure_block.py +0 -0
  204. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/transform/test_melt_columns.py +0 -0
  205. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/transform/test_text_concat.py +0 -0
  206. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/transform/test_uniform_col_val_setter.py +0 -0
  207. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_combinecolumns.py +0 -0
  208. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_duplicatecolumnsblock.py +0 -0
  209. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_flattenblock.py +0 -0
  210. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_renameblock.py +0 -0
  211. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_samplepopulatorblock.py +0 -0
  212. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_selectorblock.py +0 -0
  213. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/blocks/utilblocks/test_settomajority.py +0 -0
  214. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/__init__.py +0 -0
  215. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/conftest.py +0 -0
  216. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/test_checkpointer.py +0 -0
  217. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/test_dataset_requirements.py +0 -0
  218. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/test_migration.py +0 -0
  219. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/test_registry.py +0 -0
  220. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/flow/test_validation.py +0 -0
  221. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/utils/test_datautils.py +0 -0
  222. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/utils/test_error_handling.py +0 -0
  223. {sdg_hub-0.4.0 → sdg_hub-0.4.2}/tests/utils/test_path_resolution.py +0 -0
@@ -0,0 +1,143 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ name: Integration Test
4
+ on:
5
+ workflow_dispatch:
6
+ push:
7
+ branches:
8
+ - "main"
9
+ - "release-**"
10
+ paths:
11
+ # Only trigger on changes to relevant flows and examples (EXTEND THIS):
12
+ - 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
13
+ - 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
14
+ # Standard integration test triggers, DON'T CHANGE THIS
15
+ - 'tests/integration/**/*.py'
16
+ - 'pyproject.toml'
17
+ - 'tox.ini'
18
+ - '.github/workflows/integration-test.yml'
19
+ pull_request:
20
+ branches:
21
+ - "main"
22
+ - "release-**"
23
+ types: [opened, synchronize, reopened, labeled]
24
+ paths:
25
+ # Only trigger on changes to relevant flows and examples (EXTEND THIS):
26
+ - 'src/sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/**'
27
+ - 'examples/knowledge_tuning/enhanced_summary_knowledge_tuning/**'
28
+ # Standard integration test triggers, DON'T CHANGE THIS
29
+ - 'tests/integration/**/*.py'
30
+ - 'pyproject.toml'
31
+ - 'tox.ini'
32
+ - '.github/workflows/integration-test.yml'
33
+
34
+ env:
35
+ LC_ALL: en_US.UTF-8
36
+
37
+ defaults:
38
+ run:
39
+ shell: bash
40
+
41
+ permissions:
42
+ contents: read
43
+
44
+ jobs:
45
+ integration-test:
46
+ name: "Integration Tests - ${{ matrix.python }} on ${{ matrix.platform }}"
47
+ runs-on: "${{ matrix.platform }}"
48
+ # Require manual approval before running (via GitHub Environment)
49
+ environment: integration-tests
50
+ # Skip fork PRs (they can't access environment secrets anyway)
51
+ # Also check for 'run-integration-tests' label on labeled events
52
+ if: |
53
+ github.event_name == 'workflow_dispatch' ||
54
+ github.event_name == 'push' ||
55
+ (github.event_name == 'pull_request' &&
56
+ github.event.pull_request.head.repo.full_name == github.repository &&
57
+ (github.event.action != 'labeled' || contains(github.event.pull_request.labels.*.name, 'run-integration-tests')))
58
+ strategy:
59
+ matrix:
60
+ python:
61
+ - "3.11"
62
+ platform:
63
+ - "ubuntu-latest"
64
+ steps:
65
+ - name: "Harden Runner"
66
+ uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
67
+ with:
68
+ egress-policy: audit # TODO: change to 'egress-policy: block' after a couple of runs
69
+
70
+ - name: Checkout
71
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
72
+ with:
73
+ # https://github.com/actions/checkout/issues/249
74
+ fetch-depth: 0
75
+
76
+ - name: Free disk space
77
+ uses: ./.github/actions/free-disk-space
78
+
79
+ - name: Install the expect package
80
+ run: |
81
+ sudo apt-get install -y expect
82
+
83
+ - name: Setup Python ${{ matrix.python }}
84
+ uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
85
+ with:
86
+ python-version: ${{ matrix.python }}
87
+ cache: pip
88
+ cache-dependency-path: |
89
+ **/pyproject.toml
90
+ **/requirements*.txt
91
+
92
+ - name: Remove llama-cpp-python from cache
93
+ run: |
94
+ pip cache remove llama_cpp_python
95
+
96
+ - name: Cache huggingface datasets
97
+ uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
98
+ with:
99
+ path: ~/.cache/huggingface
100
+ # Invalidate cache when any example notebook changes (may affect dataset downloads)
101
+ key: huggingface-${{ hashFiles('examples/**/*.ipynb') }}
102
+
103
+ - name: Install dependencies
104
+ run: |
105
+ python -m pip install --upgrade pip
106
+ python -m pip install tox "tox-gh>=1.2"  # quoted: an unquoted >= is parsed by bash as a redirect to a file named "=1.2"
107
+
108
+ - name: Run integration tests with tox
109
+ env:
110
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
111
+ run: |
112
+ tox -e py3-integrationcov
113
+
114
+ - name: Remove llama-cpp-python from cache
115
+ if: always()
116
+ run: |
117
+ pip cache remove llama_cpp_python
118
+
119
+ - name: Upload integration test coverage to Codecov
120
+ uses: codecov/codecov-action@v4
121
+ with:
122
+ token: ${{ secrets.CODECOV_TOKEN }}
123
+ file: ./coverage-py3-integrationcov.xml
124
+ fail_ci_if_error: false
125
+ flags: integration
126
+
127
+ - name: Upload integration test artifacts
128
+ uses: actions/upload-artifact@v4
129
+ if: always()
130
+ with:
131
+ name: integration-test-results-${{ matrix.python }}-${{ matrix.platform }}
132
+ path: |
133
+ coverage-py3-integrationcov/
134
+ coverage-py3-integrationcov.xml
135
+ durations/py3-integrationcov.html
136
+ retention-days: 30
137
+
138
+ integration-test-workflow-complete:
139
+ needs: ["integration-test"]
140
+ runs-on: ubuntu-latest
141
+ steps:
142
+ - name: Integration Test Workflow Complete
143
+ run: echo "Integration Test Workflow Complete"
@@ -9,7 +9,8 @@ on:
9
9
  - "main"
10
10
  - "release-**"
11
11
  paths:
12
- - '**.py'
12
+ - 'src/**/*.py'
13
+ - 'tests/**/*.py'
13
14
  - 'pyproject.toml'
14
15
  - 'requirements*.txt'
15
16
  - 'tox.ini'
@@ -19,7 +20,8 @@ on:
19
20
  - "main"
20
21
  - "release-**"
21
22
  paths:
22
- - '**.py'
23
+ - 'src/**/*.py'
24
+ - 'tests/**/*.py'
23
25
  - 'pyproject.toml'
24
26
  - 'requirements*.txt'
25
27
  - 'tox.ini'
@@ -37,7 +39,7 @@ permissions:
37
39
 
38
40
  jobs:
39
41
  test:
40
- name: "${{ matrix.python }} on ${{ matrix.platform }}"
42
+ name: "Unit Tests - ${{ matrix.python }} on ${{ matrix.platform }}"
41
43
  runs-on: "${{ matrix.platform }}"
42
44
  strategy:
43
45
  matrix:
@@ -104,6 +106,7 @@ jobs:
104
106
  run: |
105
107
  tox -e py3-unitcov
106
108
 
109
+
107
110
  - name: Remove llama-cpp-python from cache
108
111
  if: always()
109
112
  run: |
@@ -84,6 +84,11 @@ target/
84
84
  # Jupyter Notebook
85
85
  .ipynb_checkpoints
86
86
 
87
+ # Integration test artifacts
88
+ tests/integration/**/converted_scripts/
89
+ tests/integration/**/test_output/
90
+ tests/integration/**/output_data/
91
+
87
92
  # IPython
88
93
  profile_default/
89
94
  ipython_config.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -65,6 +65,7 @@ Requires-Dist: pytest-html; extra == "dev"
65
65
  Requires-Dist: tox<5,>=4.4.2; extra == "dev"
66
66
  Requires-Dist: ruff; extra == "dev"
67
67
  Requires-Dist: pytest-env; extra == "dev"
68
+ Requires-Dist: nbconvert>=7.0.0; extra == "dev"
68
69
  Dynamic: license-file
69
70
 
70
71
  # `sdg_hub`: Synthetic Data Generation Toolkit
@@ -230,10 +230,244 @@ Constructs prompts from templates and data with validation and formatting suppor
230
230
 
231
231
  ## 🔍 TextParserBlock
232
232
 
233
- Extracts structured data from LLM responses using patterns, schemas, or custom parsers.
233
+ Extracts structured data from LLM responses using tag-based parsing or custom regex patterns. Essential for parsing LLM outputs into structured fields.
234
234
 
235
- #TODO: Add text parser block example
235
+ ### Basic Tag-Based Parsing
236
236
 
237
+ Extract content between start and end tags:
238
+
239
+ ```python
240
+ from sdg_hub.core.blocks import TextParserBlock
241
+ from datasets import Dataset
242
+
243
+ # Single field extraction
244
+ parser = TextParserBlock(
245
+ block_name="extract_answer",
246
+ input_cols=["llm_response"],
247
+ output_cols=["answer"],
248
+ start_tags=["<answer>"],
249
+ end_tags=["</answer>"]
250
+ )
251
+
252
+ dataset = Dataset.from_dict({
253
+ "llm_response": [
254
+ "Question analysis: ...\n<answer>Machine learning is a subset of AI.</answer>",
255
+ "Let me think...\n<answer>Neural networks process data in layers.</answer>"
256
+ ]
257
+ })
258
+
259
+ result = parser.generate(dataset)
260
+ print(result["answer"])
261
+ # ['Machine learning is a subset of AI.', 'Neural networks process data in layers.']
262
+ ```
263
+
264
+ ### Multiple Field Extraction
265
+
266
+ Extract multiple structured fields from a single response:
267
+
268
+ ```python
269
+ # Extract multiple fields with tag pairs
270
+ parser = TextParserBlock(
271
+ block_name="extract_qa",
272
+ input_cols=["llm_response"],
273
+ output_cols=["question", "answer", "confidence"],
274
+ start_tags=["<question>", "<answer>", "<confidence>"],
275
+ end_tags=["</question>", "</answer>", "</confidence>"]
276
+ )
277
+
278
+ dataset = Dataset.from_dict({
279
+ "llm_response": [
280
+ """
281
+ <question>What is Python?</question>
282
+ <answer>Python is a high-level programming language.</answer>
283
+ <confidence>0.95</confidence>
284
+ """
285
+ ]
286
+ })
287
+
288
+ result = parser.generate(dataset)
289
+ print(result["question"]) # ['What is Python?']
290
+ print(result["answer"]) # ['Python is a high-level programming language.']
291
+ print(result["confidence"]) # ['0.95']
292
+ ```
293
+
294
+ ### Custom Regex Parsing
295
+
296
+ Use regex patterns for flexible extraction:
297
+
298
+ ```python
299
+ # Extract using regex pattern
300
+ parser = TextParserBlock(
301
+ block_name="regex_parser",
302
+ input_cols=["llm_response"],
303
+ output_cols=["answer"],
304
+ parsing_pattern=r"Answer:\s*(.+?)(?:\n|$)"
305
+ )
306
+
307
+ dataset = Dataset.from_dict({
308
+ "llm_response": [
309
+ "Question: What is AI?\nAnswer: Artificial Intelligence is...\n",
310
+ "Let me answer:\nAnswer: Machine learning enables..."
311
+ ]
312
+ })
313
+
314
+ result = parser.generate(dataset)
315
+ print(result["answer"])
316
+ # ['Artificial Intelligence is...', 'Machine learning enables...']
317
+ ```
318
+
319
+ ### Tag Cleanup
320
+
321
+ Remove unwanted tags from extracted content:
322
+
323
+ ```python
324
+ # Clean up markdown and code tags
325
+ parser = TextParserBlock(
326
+ block_name="clean_parser",
327
+ input_cols=["llm_response"],
328
+ output_cols=["clean_answer"],
329
+ start_tags=["<answer>"],
330
+ end_tags=["</answer>"],
331
+ parser_cleanup_tags=["```", "###", "**"]
332
+ )
333
+
334
+ dataset = Dataset.from_dict({
335
+ "llm_response": [
336
+ "<answer>Here's the code: ```python\nprint('hello')```</answer>",
337
+ "<answer>**Important**: This is the ### answer</answer>"
338
+ ]
339
+ })
340
+
341
+ result = parser.generate(dataset)
342
+ print(result["clean_answer"])
343
+ # ['Here\'s the code: python\nprint(\'hello\')', 'Important: This is the answer']
344
+ ```
345
+
346
+ ### Handling Multiple Matches
347
+
348
+ Extract all occurrences of a pattern:
349
+
350
+ ```python
351
+ parser = TextParserBlock(
352
+ block_name="multi_extract",
353
+ input_cols=["llm_response"],
354
+ output_cols=["keywords"],
355
+ start_tags=["[KEY]"],
356
+ end_tags=["[/KEY]"]
357
+ )
358
+
359
+ dataset = Dataset.from_dict({
360
+ "llm_response": [
361
+ "Important terms: [KEY]machine learning[/KEY], [KEY]neural networks[/KEY], [KEY]deep learning[/KEY]"
362
+ ]
363
+ })
364
+
365
+ result = parser.generate(dataset)
366
+ print(result["keywords"])
367
+ # [['machine learning', 'neural networks', 'deep learning']]
368
+ ```
369
+
370
+ ### Practical Example: Evaluation Response Parsing
371
+
372
+ Common pattern for parsing LLM evaluation responses:
373
+
374
+ ```python
375
+ # Parse structured evaluation output
376
+ evaluation_parser = TextParserBlock(
377
+ block_name="parse_evaluation",
378
+ input_cols=["evaluation_response"],
379
+ output_cols=["explanation", "judgment"],
380
+ start_tags=["[Start of Explanation]", "[Start of Answer]"],
381
+ end_tags=["[End of Explanation]", "[End of Answer]"],
382
+ parser_cleanup_tags=["```", "###"]
383
+ )
384
+
385
+ dataset = Dataset.from_dict({
386
+ "evaluation_response": [
387
+ """
388
+ [Start of Explanation]
389
+ The response accurately reflects the information in the document.
390
+ No hallucinations or contradictions were found.
391
+ [End of Explanation]
392
+
393
+ [Start of Answer]
394
+ YES
395
+ [End of Answer]
396
+ """
397
+ ]
398
+ })
399
+
400
+ result = evaluation_parser.generate(dataset)
401
+ print(result["explanation"]) # ['The response accurately reflects...']
402
+ print(result["judgment"]) # ['YES']
403
+ ```
404
+
405
+ ### Integration with LLMChatBlock
406
+
407
+ TextParserBlock is commonly used after LLMChatBlock to structure responses:
408
+
409
+ ```python
410
+ from sdg_hub.core.blocks import LLMChatBlock, LLMParserBlock, TextParserBlock
411
+
412
+ # Step 1: Generate LLM response
413
+ chat_block = LLMChatBlock(
414
+ block_name="evaluator",
415
+ model="openai/gpt-4o",
416
+ input_cols=["messages"],
417
+ output_cols=["eval_response"]
418
+ )
419
+
420
+ # Step 2: Extract content from response object
421
+ # Set field_prefix explicitly to get cleaner, predictable column names
422
+ llm_parser = LLMParserBlock(
423
+ block_name="extract_eval",
424
+ input_cols=["eval_response"],
425
+ extract_content=True,
426
+ field_prefix="eval_" # Results in "eval_content" instead of the default "extract_eval_content"
427
+ )
428
+
429
+ # Step 3: Parse structured fields from text
430
+ text_parser = TextParserBlock(
431
+ block_name="parse_fields",
432
+ input_cols=["eval_content"],
433
+ output_cols=["score", "reasoning"],
434
+ start_tags=["[SCORE]", "[REASONING]"],
435
+ end_tags=["[/SCORE]", "[/REASONING]"]
436
+ )
437
+
438
+ # Execute in sequence (or use a Flow)
439
+ dataset = Dataset.from_dict({
440
+ "messages": [[{"role": "user", "content": "Evaluate this text..."}]]
441
+ })
442
+
443
+ result = chat_block.generate(dataset)
444
+ result = llm_parser.generate(result)
445
+ result = text_parser.generate(result)
446
+
447
+ print(result["score"]) # Extracted score
448
+ print(result["reasoning"]) # Extracted reasoning
449
+ ```
450
+
451
+ ### Configuration Reference
452
+
453
+ **Required Parameters:**
454
+ - `block_name` - Unique identifier for the block
455
+ - `input_cols` - Single column containing text to parse
456
+ - `output_cols` - List of field names for extracted content
457
+
458
+ **Parsing Methods (choose one):**
459
+ - **Tag-based**: `start_tags` + `end_tags` (must have same length as `output_cols`)
460
+ - **Regex**: `parsing_pattern` (single regex with capture groups)
461
+
462
+ **Optional Parameters:**
463
+ - `parser_cleanup_tags` - List of tags to remove from extracted text
464
+ - `expand_lists` - Whether to expand list inputs into rows (default: `True`)
465
+
466
+ **Tag Parsing Rules:**
467
+ - Number of tag pairs must match number of output columns
468
+ - Each tag pair extracts all matches for that field
469
+ - Tags can be any string (XML-style, markdown-style, custom)
470
+ - Missing tags result in empty lists for that field
237
471
 
238
472
  ## 🚀 Next Steps
239
473
 
@@ -148,9 +148,22 @@ Every block validates data at runtime:
148
148
  ## 🚀 Best Practices
149
149
 
150
150
  ### 1. Start Small
151
- - Use `dry_run()` to test with small samples
151
+ - Use `dry_run()` to test with small samples before processing full datasets
152
+ - Add `enable_time_estimation=True` to predict execution time for the complete dataset
152
153
  - Validate your pipeline before scaling up
153
154
 
155
+ ```python
156
+ # Test AND estimate in one call
157
+ result = flow.dry_run(dataset, sample_size=5, enable_time_estimation=True, max_concurrency=100)
158
+
159
+ # Access dry run results
160
+ print(f"Tested with {result['sample_size']} samples")
161
+ print(f"Output columns: {result['final_dataset']['columns']}")
162
+
163
+ # Time estimation is automatically displayed in a Rich table format
164
+ # No need to access it programmatically - the table shows all estimation details
165
+ ```
166
+
154
167
  ### 2. Layer Validation
155
168
  - Use basic block composition (PromptBuilder → LLMChat → Parser → Filter) to assess quality
156
169
  - Implement filtering to maintain data standards
@@ -67,7 +67,44 @@ for flow_name in all_flows:
67
67
 
68
68
  ### Getting Flow Information
69
69
 
70
- #TODO: Add flow info example
70
+ Access detailed flow metadata and configuration:
71
+
72
+ ```python
73
+ from sdg_hub.core.flow import FlowRegistry, Flow
74
+
75
+ # Get metadata for a specific flow
76
+ flow_name = "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
77
+ metadata = FlowRegistry.get_flow_metadata(flow_name)
78
+
79
+ if metadata:
80
+ print(f"Flow: {metadata.name}")
81
+ print(f"Version: {metadata.version}")
82
+ print(f"Author: {metadata.author}")
83
+ print(f"Description: {metadata.description}")
84
+ print(f"Tags: {', '.join(metadata.tags)}")
85
+ print(f"Recommended model: {metadata.recommended_models.get('default', 'Not specified')}")
86
+
87
+ # Load flow and get detailed information
88
+ flow_path = FlowRegistry.get_flow_path(flow_name)
89
+ flow = Flow.from_yaml(flow_path)
90
+
91
+ # Get comprehensive flow info
92
+ info = flow.get_info()
93
+ print(f"Total blocks: {info['total_blocks']}")
94
+ print(f"Block sequence: {', '.join(info['block_names'])}")
95
+
96
+ # Get dataset requirements
97
+ requirements = flow.get_dataset_requirements()
98
+ if requirements:
99
+ print(f"Required columns: {requirements.required_columns}")
100
+ print(f"Description: {requirements.description}")
101
+ print(f"Min samples: {requirements.min_samples}")
102
+
103
+ # Get model recommendations
104
+ recommendations = flow.get_model_recommendations()
105
+ print(f"Default model: {recommendations.get('default')}")
106
+ print(f"Compatible models: {recommendations.get('compatible', [])}")
107
+ ```
71
108
 
72
109
  ### Getting Flow Paths
73
110