sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,13 @@
1
+ - role: system
2
+ content: You are an AI assistant that is expert at summarizing text.
3
+
4
+ - role: user
5
+ content: |
6
+ Give me a detailed summary of the document below, making sure all key points are covered.
7
+
8
+ Do not add any new information.
9
+ Do not miss any key points from the provided document.
10
+
11
+ Document:
12
+ {{document_outline}}
13
+ {{document}}
@@ -0,0 +1,64 @@
1
+ - role: system
2
+ content: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
3
+
4
+ - role: user
5
+ content: |
6
+ Determine if the provided information is corroborated by the given context. Respond with YES if the context substantiates the information, even partially. Answer NO if the context does not support the information.
7
+
8
+ Guidelines:
9
+ - Answer YES when the context provides either direct or indirect evidence supporting the information. Indirect evidence may include contextual implications or inferred connections that reasonably support the information.
10
+ - Answer NO if the context lacks any supportive evidence, clearly contradicts the information, or if the support provided by the context is too vague or speculative to establish a solid connection to the information.
11
+ - Avoid using "partially" in your response. If the context provides any reasonable support (direct or indirect) for the information, consider it as a YES.
12
+
13
+ Strictly answer in this format:
14
+ [Start of Context]
15
+ ...
16
+ [End of Context]
17
+ [Start of Response]
18
+ ...
19
+ [End of Response]
20
+ [Start of Explanation]
21
+ ...
22
+ [End of Explanation]
23
+ [Start of Answer]
24
+ ...
25
+ [End of Answer]
26
+
27
+ Example 1:
28
+ [Start of Context]
29
+ An apple pie is a fruit pie with apples as the main filling. It's often served with whipped cream, ice cream, custard, or cheddar cheese. Typically, it has a double crust, with pastry above and below the filling. The upper crust can be solid or latticed.
30
+ [End of Context]
31
+ [Start of Response]
32
+ Apple pie is generally double-crusted.
33
+ [End of Response]
34
+ [Start of Explanation]
35
+ The context directly supports the information by stating that "Typically, it has a double crust," which matches the information provided.
36
+ [End of Explanation]
37
+ [Start of Answer]
38
+ YES
39
+ [End of Answer]
40
+
41
+ Example 2:
42
+ [Start of Context]
43
+ An apple pie is a fruit pie with apples as the main filling. It's often served with whipped cream, ice cream, custard, or cheddar cheese. Typically, it has a double crust, with pastry above and below the filling. The upper crust can be solid or latticed.
44
+ [End of Context]
45
+ [Start of Response]
46
+ Apple pies taste bad.
47
+ [End of Response]
48
+ [Start of Explanation]
49
+ The context does not provide any information about the taste of apple pies. The statement "Apple pies taste bad" is a subjective opinion and is not supported or mentioned in the given context.
50
+ [End of Explanation]
51
+ [Start of Answer]
52
+ NO
53
+ [End of Answer]
54
+
55
+ Now, based on the above examples and guidelines, determine if the following information is supported by the context provided. Answer YES or NO.
56
+ * Return the explanation within the [Start of Explanation] and [End of Explanation] tags.
57
+ * Return the answer between [Start of Answer] and [End of Answer] tags.
58
+
59
+ [Start of Context]
60
+ {{document}}
61
+ [End of Context]
62
+ [Start of Response]
63
+ {{response}}
64
+ [End of Response]
@@ -0,0 +1,29 @@
1
+ - role: system
2
+ content: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
3
+
4
+ - role: user
5
+ content: |
6
+ Given the question below, verify whether it meets the requirements below, and give a rating of 1 if it meets all of them or 0 otherwise.
7
+
8
+ Here are the requirements:
9
+
10
+ Non-Referential Clarity and Contextual Independence: Ensure that the question is self-explanatory and does not rely on specific, unprovided external content, such as particular documents, specific tables, or detailed datasets. The question should be structured to be understandable and clear without requiring direct access to or knowledge of these specific external sources.
11
+
12
+ Subject-Aware Completeness: The question should be crafted to be answerable on its own, given a reasonable level of specialized knowledge in the relevant subject area. It is acceptable and encouraged for the question to require specialized understanding pertinent to the topic; however, it should not depend on unique, external information not provided in the question itself. This distinction allows for questions that necessitate a deep understanding of a subject while ensuring they are not tied to specific external content like a particular dataset or a line in a document.
13
+
14
+ Please give your answer as a short explanation followed by a rating of either 0 or 1, as shown below.
15
+
16
+ * Return a short explanation within the [Start of Explanation] and [End of Explanation] tags.
17
+ * Return the rating on a binary 0/1 scale between [Start of Rating] and [End of Rating] tags.
18
+
19
+ [Start of Question]
20
+ {{question}}
21
+ [End of Question]
22
+
23
+ [Start of Explanation]
24
+ ...
25
+ [End of Explanation]
26
+
27
+ [Start of Rating]
28
+ ...
29
+ [End of Rating]
@@ -0,0 +1,81 @@
1
+ - role: system
2
+ content: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
3
+
4
+ - role: user
5
+ content: |
6
+ Your task is to assess the relevance of a given response to a specific query. This evaluation should be conducted methodically by answering two key questions:
7
+
8
+ 1. Subject Matter Relevance: Does the provided response accurately match the subject matter of the user's query? This question aims to determine if the response is directly related to the main topic or issue presented in the query.
9
+ 2. Focus and Perspective Addressing: Does the provided response effectively address the focus or perspective on the subject matter as outlined in the user's query? This question seeks to evaluate whether the response not only matches the subject matter but also aligns with the specific angle or concern raised by the user.
10
+
11
+ For each question, assign a score of 1 point if the response meets the criteria, and 0 points if it does not. After evaluating each question, provide detailed feedback explaining your reasoning behind the scores awarded.
12
+
13
+ Conclude your evaluation with a total score as a final result. The total score should represent the sum of points assigned for each question, with a maximum possible score of 2 points.
14
+ Only evaluate the response based on the above criteria, do not create new questions.
15
+
16
+ Example 1:
17
+ [Start of Question]
18
+ What is the impact of global warming on polar bears?
19
+ [End of Question]
20
+
21
+ [Start of Response]
22
+ Global warming leads to melting ice caps, reducing the habitat of polar bears and negatively impacting their hunting grounds.
23
+ [End of Response]
24
+
25
+ [Start of Feedback]
26
+ - Subject Matter Relevance Score: 1 (The response is directly related to the impact of global warming on polar bears.)
27
+ - Alignment with Query's Focus Score: 1 (The response specifically addresses how global warming affects polar bears' habitat and hunting grounds.)
28
+ [End of Feedback]
29
+
30
+ [Start of Score]
31
+ 2
32
+ [End of Score]
33
+
34
+ Example 2:
35
+ [Start of Question]
36
+ How does photosynthesis work?
37
+ [End of Question]
38
+
39
+ [Start of Response]
40
+ Plants require sunlight and water to grow.
41
+ [End of Response]
42
+
43
+ [Start of Feedback]
44
+ - Subject Matter Relevance Score: 0 (The response is related to plant growth, but does not specifically address the process of photosynthesis.)
45
+ - Alignment with Query's Focus Score: 0 (The response fails to detail the photosynthesis process, missing the specific focus of the query.)
46
+ [End of Feedback]
47
+
48
+ [Start of Score]
49
+ 0
50
+ [End of Score]
51
+
52
+ Example 3:
53
+ [Start of Question]
54
+ What are the benefits of electric vehicles?
55
+ [End of Question]
56
+
57
+ [Start of Response]
58
+ Electric vehicles reduce dependency on fossil fuels and decrease greenhouse gas emissions.
59
+ [End of Response]
60
+
61
+ [Start of Feedback]
62
+ - Subject Matter Relevance Score: 1 (The response matches the query's subject on the benefits of electric vehicles.)
63
+ - Alignment with Query's Focus Score: 1 (The response effectively addresses the environmental benefits of electric vehicles, aligning with the query's focus.)
64
+ [End of Feedback]
65
+
66
+ [Start of Score]
67
+ 2
68
+ [End of Score]
69
+
70
+ Begin your response by providing the feedback followed by the score. Be as objective as possible.
71
+
72
+ [Start of Question]
73
+ {{question}}
74
+ [End of Question]
75
+
76
+ [Start of Response]
77
+ {{response}}
78
+ [End of Response]
79
+
80
+ * Return the feedback within the [Start of Feedback] and [End of Feedback] tags.
81
+ * Return the final score between [Start of Score] and [End of Score] tags.
@@ -0,0 +1,13 @@
1
+ - role: system
2
+ content: You are an AI assistant that is expert at summarizing text.
3
+
4
+ - role: user
5
+ content: |
6
+ Give me a detailed extractive summary of the document below, making sure all key points are covered.
7
+
8
+ Do not add any new information.
9
+ Do not miss any key points from the provided document.
10
+
11
+ Document:
12
+ {{document_outline}}
13
+ {{document}}
@@ -0,0 +1,192 @@
1
+ metadata:
2
+ id: small-rock-799
3
+ name: "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
4
+ description: "A comprehensive flow that generates high-quality question-answer pairs from input documents using multiple LLM blocks for question generation, answer synthesis, and quality evaluation."
5
+ version: "1.0.0"
6
+ author: "SDG Hub Contributors"
7
+
8
+ recommended_models:
9
+ default: "meta-llama/Llama-3.3-70B-Instruct"
10
+ compatible: ["microsoft/phi-4", "mistralai/Mixtral-8x7B-Instruct-v0.1"]
11
+ experimental: []
12
+
13
+ tags:
14
+ - "question-generation"
15
+ - "knowledge-extraction"
16
+ - "qa-pairs"
17
+ - "document-processing"
18
+ - "educational"
19
+
20
+ license: "Apache-2.0"
21
+ min_sdg_hub_version: "0.2.0"
22
+
23
+ dataset_requirements:
24
+ required_columns:
25
+ - "document"
26
+ - "document_outline"
27
+ - "domain"
28
+ - "icl_document"
29
+ - "icl_query_1"
30
+ - "icl_response_1"
31
+ - "icl_query_2"
32
+ - "icl_response_2"
33
+ - "icl_query_3"
34
+ - "icl_response_3"
35
+ description: "Input dataset should contain documents with text content and domain classification. Each document should be substantial enough for meaningful question generation (minimum 100 words recommended)."
36
+
37
+ blocks:
38
+ - block_type: DuplicateColumnsBlock
39
+ block_config:
40
+ block_name: duplicate_document_col
41
+ input_cols: {document: base_document}
42
+
43
+ - block_type: PromptBuilderBlock
44
+ block_config:
45
+ block_name: detailed_summary_prompt
46
+ input_cols: [document, document_outline]
47
+ output_cols: summary_prompt
48
+ prompt_config_path: detailed_summary.yaml
49
+ format_as_messages: true
50
+
51
+ - block_type: LLMChatBlock
52
+ block_config:
53
+ block_name: gen_detailed_summary
54
+ input_cols: summary_prompt
55
+ output_cols: raw_summary_detailed
56
+ max_tokens: 2048
57
+ async_mode: true
58
+
59
+ - block_type: TextParserBlock
60
+ block_config:
61
+ block_name: parse_detailed_summary
62
+ input_cols: raw_summary_detailed
63
+ output_cols: summary_detailed
64
+ start_tags: [""]
65
+ end_tags: [""]
66
+
67
+ - block_type: PromptBuilderBlock
68
+ block_config:
69
+ block_name: atomic_facts_prompt
70
+ input_cols: [document, document_outline, domain]
71
+ output_cols: atomic_facts_prompt
72
+ prompt_config_path: atomic_facts.yaml
73
+ format_as_messages: true
74
+
75
+ - block_type: LLMChatBlock
76
+ block_config:
77
+ block_name: gen_atomic_facts
78
+ input_cols: atomic_facts_prompt
79
+ output_cols: raw_atomic_facts
80
+ max_tokens: 2048
81
+ async_mode: true
82
+
83
+ - block_type: TextParserBlock
84
+ block_config:
85
+ block_name: parse_atomic_facts
86
+ input_cols: raw_atomic_facts
87
+ output_cols: summary_atomic_facts
88
+ start_tags: [""]
89
+ end_tags: [""]
90
+
91
+ - block_type: PromptBuilderBlock
92
+ block_config:
93
+ block_name: extractive_summary_prompt
94
+ input_cols: [document, document_outline]
95
+ output_cols: extractive_summary_prompt
96
+ prompt_config_path: extractive_summary.yaml
97
+ format_as_messages: true
98
+
99
+ - block_type: LLMChatBlock
100
+ block_config:
101
+ block_name: gen_extractive_summary
102
+ input_cols: extractive_summary_prompt
103
+ output_cols: raw_summary_extractive
104
+ max_tokens: 2048
105
+ async_mode: true
106
+
107
+ - block_type: TextParserBlock
108
+ block_config:
109
+ block_name: parse_extractive_summary
110
+ input_cols: raw_summary_extractive
111
+ output_cols: summary_extractive
112
+ start_tags: [""]
113
+ end_tags: [""]
114
+
115
+ - block_type: MeltColumnsBlock
116
+ block_config:
117
+ block_name: melt_summary_columns
118
+ input_cols: [summary_detailed, summary_extractive, summary_atomic_facts, base_document]
119
+ output_cols: [summary, dataset_type]
120
+
121
+ - block_type: RenameColumnsBlock
122
+ block_config:
123
+ block_name: rename_to_document_column
124
+ input_cols: {document: raw_document, summary: document}
125
+
126
+ - block_type: PromptBuilderBlock
127
+ block_config:
128
+ block_name: knowledge_generation_prompt
129
+ input_cols: [domain, document, document_outline, icl_document, icl_query_1, icl_response_1, icl_query_2, icl_response_2, icl_query_3, icl_response_3]
130
+ output_cols: knowledge_generation_prompt
131
+ prompt_config_path: generate_questions_responses.yaml
132
+ format_as_messages: true
133
+
134
+ - block_type: LLMChatBlock
135
+ block_config:
136
+ block_name: knowledge_generation
137
+ input_cols: knowledge_generation_prompt
138
+ output_cols: raw_knowledge_generation
139
+ temperature: 0.0
140
+ max_tokens: 2048
141
+ async_mode: true
142
+
143
+ - block_type: TextParserBlock
144
+ block_config:
145
+ block_name: parse_knowledge_generation
146
+ input_cols: raw_knowledge_generation
147
+ output_cols: [question, response]
148
+ parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
149
+ parser_cleanup_tags: ["[END]"]
150
+
151
+ - block_type: EvaluateFaithfulnessBlock
152
+ block_config:
153
+ block_name: eval_faithfulness
154
+ input_cols: [document, response]
155
+ output_cols: [faithfulness_explanation, faithfulness_judgment]
156
+ prompt_config_path: evaluate_faithfulness.yaml
157
+ filter_value: "YES"
158
+ operation: eq
159
+ async_mode: true
160
+ format_as_messages: true
161
+ start_tags: ["[Start of Explanation]", "[Start of Answer]"]
162
+ end_tags: ["[End of Explanation]", "[End of Answer]"]
163
+
164
+ - block_type: EvaluateRelevancyBlock
165
+ block_config:
166
+ block_name: eval_relevancy
167
+ input_cols: [question, response]
168
+ output_cols: [relevancy_explanation, relevancy_score]
169
+ prompt_config_path: evaluate_relevancy.yaml
170
+ filter_value: 2.0
171
+ operation: eq
172
+ convert_dtype: float
173
+ max_tokens: 2048
174
+ async_mode: true
175
+ format_as_messages: true
176
+ start_tags: ["[Start of Feedback]", "[Start of Score]"]
177
+ end_tags: ["[End of Feedback]", "[End of Score]"]
178
+
179
+ - block_type: VerifyQuestionBlock
180
+ block_config:
181
+ block_name: verify_question
182
+ input_cols: [question]
183
+ output_cols: [verification_explanation, verification_rating]
184
+ prompt_config_path: evaluate_question.yaml
185
+ filter_value: 1.0
186
+ operation: ge
187
+ convert_dtype: float
188
+ max_tokens: 2048
189
+ async_mode: true
190
+ format_as_messages: true
191
+ start_tags: ["[Start of Explanation]", "[Start of Rating]"]
192
+ end_tags: ["[End of Explanation]", "[End of Rating]"]
@@ -0,0 +1,54 @@
1
+ - role: system
2
+ content: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
3
+
4
+ - role: user
5
+ content: |
6
+ Develop a series of educational question and answer pairs from a chapter in a {{domain}} textbook.
7
+
8
+ The questions should:
9
+ * Be self-contained, not requiring references to tables, figures, or specific sections in the text for understanding.
10
+ * Focus on teaching and reinforcing the key knowledge and concepts presented in the chapter.
11
+ * Avoid sections with minimal educational content like index pages or prefaces. In such cases, respond with [UNANSWERABLE].
12
+ * Be directly relevant to the textbook's domain. For instance, in a science textbook, questions should revolve around scientific terms, definitions, and practical applications, while in a legal textbook, they should cover legal principles, case law, and precedents.
13
+ * Be formulated to allow for independent answers, avoiding direct references to specific theorems or text sections. For example, rather than asking 'Under what conditions is the fixed point of a function unique according to Theorem 3.1.5?', ask 'How does the Fixed Point Iteration method contribute to understanding function uniqueness?'
14
+ * Span a range of difficulty levels to accommodate a diverse student audience, from basic understanding to advanced comprehension.
15
+ * Include a variety of question types such as multiple-choice for basic recall, short answer for deeper understanding, and essay or problem-solving questions to test application and analysis skills.
16
+ * Align closely with the learning objectives of the textbook or the specific chapter, ensuring that the questions test the fundamental concepts and skills that the chapter aims to impart.
17
+
18
+ Strictly follow this format for each question-answer pair you generate while responding:
19
+
20
+ [QUESTION]
21
+ <Insert question here>
22
+ [ANSWER]
23
+ <Insert answer here>
24
+ [END]
25
+
26
+ Each question and answer pair should stand alone as a mini-lesson, encapsulating a key concept or idea from the chapter in a way that is accessible and informative without requiring the reader to refer back to the textbook.
27
+
28
+ Here are some examples of questions:
29
+
30
+ [Document]
31
+ {{icl_document}}
32
+
33
+ [QUESTION]
34
+ {{icl_query_1}}
35
+ [ANSWER]
36
+ {{icl_response_1}}
37
+ [END]
38
+
39
+ [QUESTION]
40
+ {{icl_query_2}}
41
+ [ANSWER]
42
+ {{icl_response_2}}
43
+ [END]
44
+
45
+ [QUESTION]
46
+ {{icl_query_3}}
47
+ [ANSWER]
48
+ {{icl_response_3}}
49
+ [END]
50
+
51
+ Now, here is the document:
52
+ [DOCUMENT]
53
+ {{document_outline}}
54
+ {{document}}
@@ -0,0 +1,221 @@
1
+ Metadata-Version: 2.4
2
+ Name: sdg_hub
3
+ Version: 0.2.1
4
+ Summary: Synthetic Data Generation
5
+ Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
+ License: Apache-2.0
7
+ Project-URL: homepage, https://ai-innovation.team/
8
+ Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
9
+ Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
10
+ Classifier: Environment :: Console
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: MacOS :: MacOS X
14
+ Classifier: Operating System :: POSIX :: Linux
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: Implementation :: CPython
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: click<9.0.0,>=8.1.7
26
+ Requires-Dist: datasets<4.0.0,>=2.18.0
27
+ Requires-Dist: httpx<1.0.0,>=0.25.0
28
+ Requires-Dist: jinja2
29
+ Requires-Dist: litellm<1.75.0,>=1.73.0
30
+ Requires-Dist: openai<2.0.0,>=1.13.3
31
+ Requires-Dist: rich
32
+ Requires-Dist: pydantic<3.0.0,>=2.0.0
33
+ Requires-Dist: python-dotenv<2.0.0,>=1.0.0
34
+ Requires-Dist: tenacity!=8.4.0,>=8.3.0
35
+ Requires-Dist: tqdm<5.0.0,>=4.66.2
36
+ Provides-Extra: vllm
37
+ Requires-Dist: vllm>=0.9.1; extra == "vllm"
38
+ Requires-Dist: torch>=2.0.0; extra == "vllm"
39
+ Requires-Dist: transformers>=4.37.0; extra == "vllm"
40
+ Requires-Dist: accelerate>=0.21.0; extra == "vllm"
41
+ Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
42
+ Provides-Extra: examples
43
+ Requires-Dist: tabulate>=0.9.0; extra == "examples"
44
+ Requires-Dist: transformers>=4.37.0; extra == "examples"
45
+ Requires-Dist: langchain-text-splitters; extra == "examples"
46
+ Requires-Dist: docling>=2.3.0; extra == "examples"
47
+ Requires-Dist: scikit-learn; extra == "examples"
48
+ Requires-Dist: pandas; extra == "examples"
49
+ Requires-Dist: polars; extra == "examples"
50
+ Requires-Dist: matplotlib; extra == "examples"
51
+ Requires-Dist: spacy; extra == "examples"
52
+ Requires-Dist: nltk; extra == "examples"
53
+ Requires-Dist: sentence-transformers; extra == "examples"
54
+ Requires-Dist: instructor; extra == "examples"
55
+ Requires-Dist: fastapi; extra == "examples"
56
+ Requires-Dist: nest-asyncio; extra == "examples"
57
+ Provides-Extra: dev
58
+ Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
59
+ Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
60
+ Requires-Dist: pylint-pydantic; extra == "dev"
61
+ Requires-Dist: pytest; extra == "dev"
62
+ Requires-Dist: pytest-asyncio; extra == "dev"
63
+ Requires-Dist: pytest-cov; extra == "dev"
64
+ Requires-Dist: pytest-html; extra == "dev"
65
+ Requires-Dist: tox<5,>=4.4.2; extra == "dev"
66
+ Requires-Dist: ruff; extra == "dev"
67
+ Dynamic: license-file
68
+
69
+ # `sdg_hub`: Synthetic Data Generation Toolkit
70
+
71
+ [![Build](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml/badge.svg?branch=main)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
72
+ [![Release](https://img.shields.io/github/v/release/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
73
+ [![License](https://img.shields.io/github/license/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
74
+ [![Tests](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml/badge.svg)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
75
+ [![codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub/graph/badge.svg?token=SP75BCXWO2)](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
76
+
77
+
78
+
79
+ A modular Python framework for building synthetic data generation pipelines using composable blocks and flows. Transform datasets through **building-block composition** - mix and match LLM-powered and traditional processing blocks to create sophisticated data generation workflows.
80
+
81
+ **📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
82
+
83
+ ## ✨ Key Features
84
+
85
+ **🔧 Modular Composability** - Mix and match blocks like Lego pieces. Build simple transformations or complex multi-stage pipelines with YAML-configured flows.
86
+
87
+ **⚡ Async Performance** - High-throughput LLM processing with built-in error handling.
88
+
89
+ **🛡️ Built-in Validation** - Pydantic-based type safety ensures your configurations and data are correct before execution.
90
+
91
+ **🔍 Auto-Discovery** - Automatic block and flow registration. No manual imports or complex setup.
92
+
93
+ **📊 Rich Monitoring** - Detailed logging with progress bars and execution summaries.
94
+
95
+ **🧩 Easily Extensible** - Create custom blocks with simple inheritance. Rich logging and monitoring built-in.
96
+
97
+
98
+ ## 📦 Installation
99
+
100
+ Recommended: Install uv — see https://docs.astral.sh/uv/getting-started/installation/
101
+
102
+ ```bash
103
+ # Production
104
+ uv pip install sdg-hub
105
+
106
+ # Development
107
+ git clone https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
108
+ cd sdg_hub
109
+ uv pip install ".[dev]"
110
+ # or: uv sync --extra dev
111
+ ```
112
+
113
+ ### Optional Dependencies
114
+ ```bash
115
+ # For vLLM support
116
+ uv pip install "sdg-hub[vllm]"
117
+
118
+ # For examples
119
+ uv pip install "sdg-hub[examples]"
120
+ ```
121
+
122
+ ## 🚀 Quick Start
123
+
124
+ ### Core Concepts
125
+
126
+ **Blocks** are composable units that transform datasets - think of them as data processing Lego pieces. Each block performs a specific task: LLM chat, text parsing, evaluation, or transformation.
127
+
128
+ **Flows** orchestrate multiple blocks into complete pipelines defined in YAML. Chain blocks together to create complex data generation workflows with validation and parameter management.
129
+
130
+ ```text
131
+ # Simple concept: Blocks transform data, Flows chain blocks together
132
+ dataset → Block₁ → Block₂ → Block₃ → enriched_dataset
133
+ ```
134
+
135
+ ### Try it out!
136
+
137
+ #### Flow Discovery
138
+ ```python
139
+ from sdg_hub import FlowRegistry, Flow
140
+
141
+ # Auto-discover all available flows (no setup needed!)
142
+ FlowRegistry.discover_flows()
143
+
144
+ # List available flows
145
+ flows = FlowRegistry.list_flows()
146
+ print(f"Available flows: {flows}")
147
+
148
+ # Search for specific types
149
+ qa_flows = FlowRegistry.search_flows(tag="question-generation")
150
+ print(f"QA flows: {qa_flows}")
151
+ ```
152
+
153
+ Each flow has a **unique, human-readable ID** automatically generated from its name. These IDs provide a convenient shorthand for referencing flows:
154
+
155
+ ```python
156
+ # Every flow gets a deterministic ID
157
+ # Same flow name always generates the same ID
158
+ flow_id = "small-rock-799"
159
+
160
+ # Use ID to reference the flow
161
+ flow_path = FlowRegistry.get_flow_path(flow_id)
162
+ flow = Flow.from_yaml(flow_path)
163
+ ```
164
+
165
+ #### Discovering and Configuring Models
166
+ ```python
167
+ # Discover recommended models
168
+ default_model = flow.get_default_model()
169
+ recommendations = flow.get_model_recommendations()
170
+
171
+ # Configure model settings at runtime
172
+ # This assumes you have a hosted vLLM instance of meta-llama/Llama-3.3-70B-Instruct running at http://localhost:8000/v1
173
+ flow.set_model_config(
174
+ model=f"hosted_vllm/{default_model}",
175
+ api_base="http://localhost:8000/v1",
176
+ api_key="your_key",
177
+ )
178
+ ```
179
+ #### Load your dataset and run the flow
180
+ ```python
181
+ # Create your dataset with required columns (requires `from datasets import Dataset`)
182
+ dataset = Dataset.from_dict({
183
+ 'document': ['Your document text here...'],
184
+ 'document_outline': ['1. Topic A; 2. Topic B; 3. Topic C'],
185
+ 'domain': ['Computer Science'],
186
+ 'icl_document': ['Example document for in-context learning...'],
187
+ 'icl_query_1': ['Example question 1?'],
188
+ 'icl_response_1': ['Example answer 1'],
189
+ 'icl_query_2': ['Example question 2?'],
190
+ 'icl_response_2': ['Example answer 2'],
191
+ 'icl_query_3': ['Example question 3?'],
192
+ 'icl_response_3': ['Example answer 3']
193
+ })
194
+
195
+ # Quick Testing with Dry Run
196
+ dry_result = flow.dry_run(dataset, sample_size=1)
197
+ print(f"Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
198
+ print(f"Output columns: {dry_result['final_dataset']['columns']}")
199
+
200
+ # Generate high-quality QA pairs
201
+ result = flow.generate(dataset)
202
+
203
+ # Access generated content
204
+ questions = result['question']
205
+ answers = result['response']
206
+ faithfulness_scores = result['faithfulness_judgment']
207
+ relevancy_scores = result['relevancy_score']
208
+ ```
209
+
210
+
211
+ ## 📄 License
212
+
213
+ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
214
+
215
+ ## 🤝 Contributing
216
+
217
+ We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on how to contribute to this project.
218
+
219
+ ---
220
+
221
+ Built with ❤️ by the Red Hat AI Innovation Team