sdg-hub 0.1.4 → 0.2.1 (py3-none-any.whl)

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects only the changes between those published versions.
Files changed (145)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
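Judging purely from the paths above, 0.2.1 reorganizes the package: the flat 0.1.4 modules (sdg_hub/blocks, sdg_hub/flow.py, sdg_hub/pipeline.py, sdg_hub/registry.py) are deleted and their successors live under sdg_hub/core. A hypothetical sketch of how imports shift under the new layout, inferred only from this file listing (the re-exports added to sdg_hub/__init__.py are not shown in this diff):

    # Hypothetical mapping inferred from the file listing above; the concrete
    # 0.2.1 public API is not visible in this diff.
    try:
        # 0.2.1 layout: subpackages under sdg_hub.core
        from sdg_hub.core import blocks, flow            # sdg_hub/core/{blocks,flow}/__init__.py
        from sdg_hub.core.utils import path_resolution   # moved from sdg_hub/utils/
    except ImportError:
        # 0.1.4 layout: flat top-level modules (removed in this release)
        from sdg_hub import blocks, flow
        from sdg_hub.utils import path_resolution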
sdg_hub/flows/generation/skills/improve_responses.yaml DELETED
@@ -1,103 +0,0 @@
- - block_type: LLMBlock
- block_config:
- block_name: router
- config_path: configs/skills/router.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - route
- gen_kwargs:
- temperature: 0
- max_tokens: 5
- extra_body:
- guided_choice:
- - "coding"
- - "extraction"
- - "humanities"
- - "math"
- - "reasoning"
- - "roleplay"
- - "STEM"
- - "writing"
- - block_type: SamplePopulatorBlock
- block_config:
- block_name: icl_populator
- config_paths:
- - configs/skills/icl_examples/coding.yaml
- - configs/skills/icl_examples/extraction.yaml
- - configs/skills/icl_examples/humanities.yaml
- - configs/skills/icl_examples/math.yaml
- - configs/skills/icl_examples/reasoning.yaml
- - configs/skills/icl_examples/roleplay.yaml
- - configs/skills/icl_examples/STEM.yaml
- - configs/skills/icl_examples/writing.yaml
- column_name: route
- batch_kwargs:
- num_procs: 8
- - block_type: LLMBlock
- block_config:
- block_name: analyzer
- config_path: configs/skills/analyzer.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - analysis
- - rubric
- - block_type: LLMBlock
- block_config:
- block_name: critic
- config_path: configs/skills/critic.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - critique
- - block_type: LLMBlock
- block_config:
- block_name: planner
- config_path: configs/skills/planner.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - plan
- - block_type: LLMBlock
- block_config:
- block_name: revised_responder
- config_path: configs/skills/revised_responder.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - revised_response
- drop_columns:
- - icl_query
- - icl_response
- - icl_analysis
- - icl_rubric
- - icl_critique
- - icl_plan
- - icl_revised_response
- - block_type: LLMBlock
- block_config:
- block_name: judge
- config_path: configs/skills/judge.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - judgement
- - verdict
- - block_type: FilterByValueBlock
- block_config:
- block_name: filter_judgement
- filter_column: verdict
- filter_value:
- - Assistant A
- - Assistant B
- operation: operator.contains
- batch_kwargs:
- num_procs: 8
- - block_type: SelectorBlock
- block_config:
- block_name: response_selector
- choice_map:
- Assistant A: "response"
- Assistant B: "revised_response"
- choice_col: verdict
- output_col: chosen_response
- batch_kwargs:
- num_procs: 8
- drop_columns:
- - judgemnent
- - verdict
sdg_hub/flows/generation/skills/simple_freeform_skill.yaml DELETED
@@ -1,12 +0,0 @@
- - block_type: LLMBlock
- block_config:
- block_name: gen_skill_freeform
- config_path: configs/skills/simple_generate_qa_freeform.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
- output_cols:
- - output
- gen_kwargs:
- temperature: 0.7
- max_tokens: 2048
- drop_duplicates:
- - output
sdg_hub/flows/generation/skills/simple_grounded_skill.yaml DELETED
@@ -1,12 +0,0 @@
- - block_type: LLMBlock
- block_config:
- block_name: gen_skill_grounded
- config_path: configs/skills/simple_generate_qa_grounded.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
- output_cols:
- - output
- gen_kwargs:
- temperature: 0.7
- max_tokens: 2048
- drop_duplicates:
- - output
sdg_hub/flows/generation/skills/synth_grounded_skills.yaml DELETED
@@ -1,80 +0,0 @@
- - block_type: LLMBlock
- block_config:
- block_name: gen_contexts
- config_path: configs/skills/contexts.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
- output_cols:
- - context
- gen_kwargs:
- temperature: 0.7
- max_tokens: 2048
- n: 10
- seed: 42
- drop_duplicates:
- - context
- - block_type: LLMBlock
- block_config:
- block_name: gen_grounded_questions
- config_path: configs/skills/grounded_questions.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
- output_cols:
- - question
- batch_kwargs:
- num_samples: 3
- drop_duplicates:
- - question
- - block_type: LLMBlock
- block_config:
- block_name: eval_grounded_questions
- config_path: configs/skills/evaluate_grounded_questions.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
- output_cols:
- - evaluation
- - score
- - block_type: FilterByValueBlock
- block_config:
- block_name: filter_grounded_questions
- filter_column: score
- filter_value: 1.0
- operation: operator.eq
- convert_dtype: float
- batch_kwargs:
- num_procs: 8
- drop_columns:
- - evaluation
- - score
- - num_samples
- - block_type: LLMBlock
- block_config:
- block_name: gen_grounded_responses
- config_path: configs/skills/grounded_responses.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
- output_cols:
- - response
- - block_type: LLMBlock
- block_config:
- block_name: evaluate_grounded_qa_pair
- config_path: configs/skills/evaluate_grounded_pair.yaml
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
- output_cols:
- - evaluation
- - score
- - block_type: FilterByValueBlock
- block_config:
- block_name: filter_grounded_qa_pair
- filter_column: score
- filter_value: 2.0
- operation: operator.ge
- convert_dtype: float
- batch_kwargs:
- num_procs: 8
- - block_type: CombineColumnsBlock
- block_config:
- block_name: combine_question_and_context
- columns:
- - context
- - question
- output_col: question
- batch_kwargs:
- num_procs: 8
- batched: True
sdg_hub/flows/generation/skills/synth_skills.yaml DELETED
@@ -1,59 +0,0 @@
- - block_type: LLMBlock
- block_config:
- block_name: gen_questions
- config_path: configs/skills/freeform_questions.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - question
- batch_kwargs:
- num_samples: 30
- drop_duplicates:
- - question
- - block_type: LLMBlock
- block_config:
- block_name: eval_questions
- config_path: configs/skills/evaluate_freeform_questions.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - evaluation
- - score
- - block_type: FilterByValueBlock
- block_config:
- block_name: filter_questions
- filter_column: score
- filter_value: 1.0
- operation: operator.eq
- convert_dtype: float
- batch_kwargs:
- num_procs: 8
- drop_columns:
- - evaluation
- - score
- - num_samples
- - block_type: LLMBlock
- block_config:
- block_name: gen_responses
- config_path: configs/skills/freeform_responses.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - response
- - block_type: LLMBlock
- block_config:
- block_name: evaluate_qa_pair
- config_path: configs/skills/evaluate_freeform_pair.yaml
- model_id: meta-llama/Llama-3.3-70B-Instruct
- output_cols:
- - evaluation
- - score
- - block_type: FilterByValueBlock
- block_config:
- block_name: filter_qa_pair
- filter_column: score
- filter_value: 2.0
- operation: operator.ge
- convert_dtype: float
- batch_kwargs:
- num_procs: 8
- drop_columns:
- - evaluation
- - score
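The FilterByValueBlock entries in the deleted flows above express their predicate as a dotted path into Python's standard operator module (operator.eq, operator.ge, operator.contains) together with an optional convert_dtype. A small illustration of what those stdlib functions evaluate to for the score columns used here and the verdict filter in improve_responses above; this shows only the named operators, not the block's internal implementation, and the float cast is an inference from convert_dtype: float:

    import operator

    score = float("2.0")                      # convert_dtype: float (inferred cast)
    operator.eq(score, 1.0)                   # operation: operator.eq, filter_value: 1.0 -> False
    operator.ge(score, 2.0)                   # operation: operator.ge, filter_value: 2.0 -> True

    # With a list filter_value, operator.contains(container, item) checks membership.
    operator.contains(["Assistant A", "Assistant B"], "Assistant A")  # -> True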
sdg_hub/pipeline.py DELETED
@@ -1,121 +0,0 @@
- """
- Deprecated Pipeline class for data generation pipelines.
-
- Use the Flow class directly for new code.
- """
-
- # SPDX-License-Identifier: Apache-2.0
- # Standard
- import warnings
- from typing import List, Dict, Any
-
- # Third Party
- from datasets import Dataset
- from datasets.data_files import EmptyDatasetError
-
- # Local
- from .logger_config import setup_logger
-
- logger = setup_logger(__name__)
-
-
- class Pipeline:
-     """A class representing a data generation pipeline.
-
-     This class is deprecated and will be removed in a future version.
-     Use the Flow class directly instead.
-
-     Parameters
-     ----------
-     chained_blocks : List[Dict[str, Any]]
-         List of block configurations to execute in sequence.
-
-     Attributes
-     ----------
-     chained_blocks : List[Dict[str, Any]]
-         List of block configurations to execute in sequence.
-     """
-
-     def __init__(self, chained_blocks: List[Dict[str, Any]]) -> None:
-         """
-         Initialize the Pipeline class with a configuration dictionary.
-
-         DEPRECATED: This class is deprecated and will be removed in a future version.
-         Use the Flow class directly instead.
-         """
-         warnings.warn(
-             "Pipeline class is deprecated and will be removed in a future version. "
-             "Use Flow class directly instead of wrapping it with Pipeline.",
-             DeprecationWarning,
-             stacklevel=2
-         )
-         # pipeline config is the run configuration that consists of the pipeline steps
-         self.chained_blocks = chained_blocks
-
-     def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
-         """Drop duplicates from the dataset based on the columns provided.
-
-         Parameters
-         ----------
-         dataset : Dataset
-             The input dataset.
-         cols : List[str]
-             Columns to consider for duplicate detection.
-
-         Returns
-         -------
-         Dataset
-             Dataset with duplicates removed.
-         """
-         df = dataset.to_pandas()
-         df = df.drop_duplicates(subset=cols).reset_index(drop=True)
-         return Dataset.from_pandas(df)
-
-     def generate(self, dataset: Dataset) -> Dataset:
-         """Generate the dataset by running the pipeline steps.
-
-         Parameters
-         ----------
-         dataset : Dataset
-             The input dataset to process.
-
-         Returns
-         -------
-         Dataset
-             The processed dataset.
-
-         Raises
-         ------
-         EmptyDatasetError
-             If a block produces an empty dataset.
-         """
-         for block_prop in self.chained_blocks:
-             block_type = block_prop["block_type"]
-             block_config = block_prop["block_config"]
-             drop_columns = block_prop.get("drop_columns", [])
-             gen_kwargs = block_prop.get("gen_kwargs", {})
-             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
-             block = block_type(**block_config)
-
-             logger.debug("------------------------------------\n")
-             logger.debug("Running block: %s", block_config["block_name"])
-             logger.debug("Input dataset: %s", dataset)
-
-             dataset = block.generate(dataset, **gen_kwargs)
-
-             if len(dataset) == 0:
-                 raise EmptyDatasetError(
-                     f"Pipeline stopped: Empty dataset after running block: {block_config['block_name']}"
-                 )
-
-             drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
-             if drop_columns:
-                 dataset = dataset.remove_columns(drop_columns_in_ds)
-
-             if drop_duplicates_cols:
-                 dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
-
-             logger.debug("Output dataset: %s", dataset)
-             logger.debug("------------------------------------\n\n")
-
-         return dataset
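For reference, the generate loop above expects each entry of chained_blocks to provide block_type (a class instantiated as block_type(**block_config)) plus optional gen_kwargs, drop_columns, and drop_duplicates keys. A minimal sketch of driving the deprecated class with a stand-in block; EchoBlock is hypothetical and exists only to mirror the generate(dataset, **gen_kwargs) interface the loop calls:

    from datasets import Dataset
    from sdg_hub.pipeline import Pipeline  # 0.1.4 import path; removed in 0.2.1

    class EchoBlock:
        """Hypothetical stand-in; real blocks lived in sdg_hub.blocks."""
        def __init__(self, block_name):
            self.block_name = block_name

        def generate(self, dataset, **gen_kwargs):
            # A real block would add generated columns here.
            return dataset.map(lambda row: {"output": row["prompt"].upper()})

    pipeline = Pipeline(  # emits the DeprecationWarning shown above
        chained_blocks=[
            {
                "block_type": EchoBlock,                 # instantiated as block_type(**block_config)
                "block_config": {"block_name": "echo"},
                "gen_kwargs": {},                        # forwarded to block.generate
                "drop_duplicates": ["output"],           # de-duplicate on this column
            }
        ]
    )
    result = pipeline.generate(Dataset.from_list([{"prompt": "hi"}, {"prompt": "hi"}]))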
sdg_hub/prompts.py DELETED
@@ -1,80 +0,0 @@
- # Local
- from .registry import PromptRegistry
-
-
- @PromptRegistry.register("blank")
- def blank_chat_template():
- return """{{ messages }}"""
-
-
- @PromptRegistry.register("instructlab")
- def instructlab_chat_template():
- return """{% for message in messages %}{% if message['role'] == 'pretraining' %}{{ '<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}{% elif message['role'] == 'system' %}{{ '<|system|>' + '\n' + message['content'] + '\n' }}{% elif message['role'] == 'user' %}{{ '<|user|>' + '\n' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' + '\n' }}{% endif %}{% endfor %}"""
-
-
- @PromptRegistry.register("mistralai/Mixtral")
- def mistral_chat_template():
- return """{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n<s>\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + '</s>'}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n"""
-
-
- @PromptRegistry.register("meta-llama/Llama-3.3")
- def meta_llama_chat_template():
- return """{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n"""
-
-
- @PromptRegistry.register("microsoft/phi-4")
- def microsoft_phi_chat_template():
- return """{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}"""
-
-
- @PromptRegistry.register("nvidia/Llama-3_3-Nemotron-Super-49B-v1")
- def nemotron_chat_template():
- """
- Format chat messages for the Nemotron model, including a system prompt and structured message headers.
-
- The template starts with a system message containing "detailed thinking on", then iterates over messages, wrapping each with start and end header tokens and an end-of-text token. For assistant messages containing a `</think>` tag, only the content after this tag is included. Optionally appends an assistant prompt if generation is requested.
- """
- return """{{- bos_token }}
- {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}detailed thinking on{{- "<|eot_id|>" }}
- {%- for message in messages %}
- {%- if message['role'] == 'assistant' and '</think>' in message['content'] %}
- {%- set content = message['content'].split('</think>')[-1].lstrip() %}
- {%- else %}
- {%- set content = message['content'] %}
- {%- endif %}
- {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + content | trim + '<|eot_id|>' }}
- {%- endfor %}
- {%- if add_generation_prompt %}
- {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
- {%- endif %}"""
-
-
- @PromptRegistry.register("Qwen/Qwen2.5")
- def qwen_2_5_chat_template():
- """
- Formats chat messages into the prompt structure required by the Qwen 2.5 model family, supporting system messages, tool descriptions, function call instructions, and role-based message formatting.
-
- If tools are provided, includes tool signatures and instructions for function calls in the system prompt. User, assistant, and tool messages are wrapped with special tokens, and assistant tool calls are serialized as JSON within XML tags. Optionally appends a generation prompt for the assistant.
- """
- return """{%- if tools %}\n {{- \'<|im_start|>system\\n\' }}\n {%- if messages[0][\'role\'] == \'system\' %}\n {{- messages[0][\'content\'] }}\n {%- else %}\n {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n {%- endif %}\n {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n {%- for tool in tools %}\n {{- "\\n" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n {%- if messages[0][\'role\'] == \'system\' %}\n {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}\n {%- else %}\n {{- \'<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n\' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}\n {{- \'<|im_start|>\' + message.role + \'\\n\' + message.content + \'<|im_end|>\' + \'\\n\' }}\n {%- elif message.role == "assistant" %}\n {{- \'<|im_start|>\' + message.role }}\n {%- if message.content %}\n {{- \'\\n\' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- \'\\n<tool_call>\\n{"name": "\' }}\n {{- tool_call.name }}\n {{- \'", "arguments": \' }}\n {{- tool_call.arguments | tojson }}\n {{- \'}\\n</tool_call>\' }}\n {%- endfor %}\n {{- \'<|im_end|>\\n\' }}\n {%- elif message.role == "tool" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}\n {{- \'<|im_start|>user\' }}\n {%- endif %}\n {{- \'\\n<tool_response>\\n\' }}\n {{- message.content }}\n {{- \'\\n</tool_response>\' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n {{- \'<|im_end|>\\n\' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- \'<|im_start|>assistant\\n\' }}\n{%- endif %}\n"""
-
-
- @PromptRegistry.register("Qwen/Qwen3")
- def qwen_3_chat_template():
- """
- Formats chat messages for the Qwen 3 model family, supporting multi-step tool usage, reasoning content, and special XML tags for tool calls and responses.
-
- This template handles system messages, user and assistant roles, and tool interactions. When tools are provided, it outputs their signatures and instructions for function calls. It tracks the last user query to determine where to insert assistant reasoning content within `<think>` tags. Assistant tool calls are serialized as JSON within `<tool_call>` tags, and tool responses are grouped inside `<tool_response>` tags. Optionally, a generation prompt and empty reasoning block can be added.
-
- Parameters:
- tools (optional): List of tool signature objects to be included in the prompt.
- messages: List of message objects, each with a role and content, and optionally tool_calls or reasoning_content.
- add_generation_prompt (optional): If true, appends an assistant prompt for generation.
- enable_thinking (optional): If false, inserts an empty reasoning block in the assistant prompt.
- """
- return """{%- if tools %}\n {{- \'<|im_start|>system\\n\' }}\n {%- if messages[0].role == \'system\' %}\n {{- messages[0].content + \'\\n\\n\' }}\n {%- endif %}\n {{- "# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n {%- for tool in tools %}\n {{- "\\n" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n {%- if messages[0].role == \'system\' %}\n {{- \'<|im_start|>system\\n\' + messages[0].content + \'<|im_end|>\\n\' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith(\'<tool_response>\') and message.content.endswith(\'</tool_response>\')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = \'\' %}\n {%- endif %}\n {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}\n {{- \'<|im_start|>\' + message.role + \'\\n\' + content + \'<|im_end|>\' + \'\\n\' }}\n {%- elif message.role == "assistant" %}\n {%- set reasoning_content = \'\' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if \'</think>\' in content %}\n {%- set reasoning_content = content.split(\'</think>\')[0].rstrip(\'\\n\').split(\'<think>\')[-1].lstrip(\'\\n\') %}\n {%- set content = content.split(\'</think>\')[-1].lstrip(\'\\n\') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- \'<|im_start|>\' + message.role + \'\\n<think>\\n\' + reasoning_content.strip(\'\\n\') + \'\\n</think>\\n\\n\' + content.lstrip(\'\\n\') }}\n {%- else %}\n {{- \'<|im_start|>\' + message.role + \'\\n\' + content }}\n {%- endif %}\n {%- else %}\n {{- \'<|im_start|>\' + message.role + \'\\n\' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- \'\\n\' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- \'<tool_call>\\n{"name": "\' }}\n {{- tool_call.name }}\n {{- \'", "arguments": \' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- \'}\\n</tool_call>\' }}\n {%- endfor %}\n {%- endif %}\n {{- \'<|im_end|>\\n\' }}\n {%- elif message.role == "tool" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}\n {{- \'<|im_start|>user\' }}\n {%- endif %}\n {{- \'\\n<tool_response>\\n\' }}\n {{- content }}\n {{- \'\\n</tool_response>\' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n {{- \'<|im_end|>\\n\' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- 
\'<|im_start|>assistant\\n\' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- \'<think>\\n\\n</think>\\n\\n\' }}\n {%- endif %}\n{%- endif %}"""
-
-
- @PromptRegistry.register("mistralai/Mistral-Small-3")
- def mistral_small_3_chat_template():
- return """{%- if not date_string is defined %}\n {%- set date_string = \"2025-01-01\" %}\n{%- endif %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + date_string + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n<s>\n\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- elif message['role'] == 'system' %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- elif message['role'] == 'assistant' %}\n {{- message['content'] + '</s>' }}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}"""
sdg_hub/registry.py DELETED
@@ -1,122 +0,0 @@
- # Standard
- from typing import Union, List, Dict
-
- # Third Party
- from jinja2 import Template
-
- # Local
- from .logger_config import setup_logger
-
- logger = setup_logger(__name__)
-
-
- class BlockRegistry:
-     """Registry for block classes to avoid manual additions to block type map."""
-
-     _registry: Dict[str, type] = {}
-
-     @classmethod
-     def register(cls, block_name: str):
-         """
-         Decorator to register a block class under a specified name.
-
-         :param block_name: Name under which to register the block.
-         """
-
-         def decorator(block_class):
-             cls._registry[block_name] = block_class
-             logger.debug(
-                 f"Registered block '{block_name}' with class '{block_class.__name__}'"
-             )
-             return block_class
-
-         return decorator
-
-     @classmethod
-     def get_registry(cls):
-         """
-         Retrieve the current registry map of block types.
-
-         :return: Dictionary of registered block names and classes.
-         """
-         logger.debug("Fetching the block registry map.")
-         return cls._registry
-
-
- class PromptRegistry:
-     """Registry for managing Jinja2 prompt templates."""
-
-     _registry: Dict[str, Template] = {}
-
-     @classmethod
-     def register(cls, name: str):
-         """Decorator to register a Jinja2 template function by name.
-
-         :param name: Name of the template to register.
-         :return: A decorator that registers the Jinja2 template function.
-         """
-
-         def decorator(func):
-             template_str = func()
-             cls._registry[name] = Template(template_str)
-             logger.debug(f"Registered prompt template '{name}'")
-             return func
-
-         return decorator
-
-     @classmethod
-     def get_template(cls, name: str) -> Template:
-         """Retrieve a Jinja2 template by name.
-
-         :param name: Name of the template to retrieve.
-         :return: The Jinja2 template instance.
-         """
-         if name not in cls._registry:
-             raise KeyError(f"Template '{name}' not found.")
-         logger.debug(f"Retrieving prompt template '{name}'")
-         return cls._registry[name]
-
-     @classmethod
-     def get_registry(cls):
-         """
-         Retrieve the current registry map of block types.
-
-         :return: Dictionary of registered block names and classes.
-         """
-         logger.debug("Fetching the block registry map.")
-         return cls._registry
-
-     @classmethod
-     def render_template(
-         cls,
-         name: str,
-         messages: Union[str, List[Dict[str, str]]],
-         add_generation_prompt: bool = True,
-     ) -> str:
-         """Render the template with the provided messages or query.
-
-         :param name: Name of the template to render.
-         :param messages: Either a single query string or a list of messages (each as a dict with 'role' and 'content').
-         :param add_generation_prompt: Whether to add a generation prompt at the end.
-         :return: The rendered prompt as a string.
-         """
-
-         # Special handling for "blank" template
-         if name == "blank":
-             if not isinstance(messages, str):
-                 raise ValueError(
-                     "The 'blank' template can only be used with a single query string, not a list of messages."
-                 )
-             return messages # Return the query as-is without templating
-
-         # Get the template
-         template = cls.get_template(name)
-
-         # If `messages` is a string, wrap it in a list with a default user role
-         if isinstance(messages, str):
-             messages = [{"role": "user", "content": messages}]
-
-         # Render the template with the `messages` list
-         return template.render(
-             messages=messages, add_generation_prompt=add_generation_prompt
-         )
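Together with the deleted prompts.py above, this registry resolved chat templates by name and rendered them over a message list. A short sketch of the 0.1.4-era usage, based only on the code shown in these hunks:

    from sdg_hub.registry import PromptRegistry  # 0.1.4 import path; removed in 0.2.1
    import sdg_hub.prompts  # noqa: F401  (the decorators above register the templates at import time)

    # Render a registered chat template over a message list.
    prompt = PromptRegistry.render_template(
        "instructlab",
        messages=[{"role": "user", "content": "Summarize the document."}],
        add_generation_prompt=True,
    )

    # The "blank" template bypasses Jinja and only accepts a plain query string.
    raw = PromptRegistry.render_template("blank", "Summarize the document.")

    # get_template returns the underlying jinja2.Template for a registered name.
    mixtral_template = PromptRegistry.get_template("mistralai/Mixtral")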