sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,23 @@
1
# SPDX-License-Identifier: Apache-2.0
"""Data transformation blocks for dataset manipulation.

This module provides blocks for transforming datasets including column operations,
wide-to-long transformations, value selection, and majority value assignment.
"""

# Local
from .duplicate_columns import DuplicateColumnsBlock
from .index_based_mapper import IndexBasedMapperBlock
from .melt_columns import MeltColumnsBlock
from .rename_columns import RenameColumnsBlock
from .text_concat import TextConcatBlock
from .uniform_col_val_setter import UniformColumnValueSetter

# Public API of the transform subpackage.
__all__ = [
    "TextConcatBlock",
    "DuplicateColumnsBlock",
    "MeltColumnsBlock",
    "IndexBasedMapperBlock",
    "RenameColumnsBlock",
    "UniformColumnValueSetter",
]
@@ -0,0 +1,88 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Duplicate columns block for dataset column duplication operations.
3
+
4
+ This module provides a block for duplicating existing columns with new names
5
+ according to a mapping specification.
6
+ """
7
+
8
+ # Standard
9
+ from typing import Any
10
+
11
+ # Third Party
12
+ from datasets import Dataset
13
+ from pydantic import field_validator
14
+
15
+ # Local
16
+ from ...utils.logger_config import setup_logger
17
+ from ..base import BaseBlock
18
+ from ..registry import BlockRegistry
19
+
20
+ logger = setup_logger(__name__)
21
+
22
+
23
@BlockRegistry.register(
    "DuplicateColumnsBlock",
    "transform",
    "Duplicates existing columns with new names according to a mapping specification",
)
class DuplicateColumnsBlock(BaseBlock):
    """Block for duplicating existing columns with new names.

    This block creates copies of existing columns with new names according to a mapping specification.
    The mapping is provided through input_cols as a dictionary.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Dict[str, str]
        Dictionary mapping existing column names to new column names.
        Keys are existing column names, values are new column names.
    """

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Validate that input_cols is a non-empty dict."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        if not isinstance(v, dict):
            raise ValueError(
                "input_cols must be a dictionary mapping existing column names to new column names"
            )
        return v

    def model_post_init(self, __context: Any) -> None:
        """Initialize derived attributes after Pydantic validation."""
        # Pydantic v2's BaseModel always defines model_post_init, so call the
        # parent hook directly instead of guarding it behind hasattr().
        super().model_post_init(__context)

        # Default output_cols to the new column names being created.
        if self.output_cols is None:
            self.output_cols = list(self.input_cols.values())

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a dataset with duplicated columns.

        Parameters
        ----------
        samples : Dataset
            Input dataset to duplicate columns from.

        Returns
        -------
        Dataset
            Dataset with additional duplicated columns.

        Raises
        ------
        ValueError
            If a source column named in input_cols is missing from the dataset.
        """
        # Dataset.add_column returns a new dataset each time, so the caller's
        # `samples` is never mutated.
        result = samples

        # Duplicate each column as specified in the mapping.
        for source_col, target_col in self.input_cols.items():
            if source_col not in result.column_names:
                raise ValueError(f"Source column '{source_col}' not found in dataset")

            result = result.add_column(target_col, result[source_col])

        return result
@@ -0,0 +1,225 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Selector block for column value selection and mapping.
3
+
4
+ This module provides a block for selecting and mapping values from one column
5
+ to another based on a choice column's value.
6
+ """
7
+
8
+ # Standard
9
+ from typing import Any
10
+
11
+ # Third Party
12
+ from datasets import Dataset
13
+ from pydantic import Field, field_validator, model_validator
14
+
15
+ # Local
16
+ from ...utils.error_handling import MissingColumnError
17
+ from ...utils.logger_config import setup_logger
18
+ from ..base import BaseBlock
19
+ from ..registry import BlockRegistry
20
+
21
+ logger = setup_logger(__name__)
22
+
23
+
24
@BlockRegistry.register(
    "IndexBasedMapperBlock",
    "transform",
    "Maps values from source columns to output columns based on choice columns using shared mapping",
)
class IndexBasedMapperBlock(BaseBlock):
    """Block for mapping values from source columns to output columns based on choice columns.

    This block uses a shared mapping dictionary to select values from source columns and
    store them in output columns based on corresponding choice columns' values.
    The choice_cols and output_cols must have the same length - choice_cols[i] determines
    the value for output_cols[i].

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Union[str, List[str], Dict[str, Any], None]
        Input column specification. Should include choice columns and mapped columns.
    output_cols : Union[str, List[str], Dict[str, Any], None]
        Output column specification. Must have same length as choice_cols.
    choice_map : Dict[str, str]
        Dictionary mapping choice values to source column names.
    choice_cols : List[str]
        List of column names containing choice values. Must have same length as output_cols.
    """

    choice_map: dict[str, str] = Field(
        ..., description="Dictionary mapping choice values to column names"
    )
    choice_cols: list[str] = Field(
        ..., description="List of column names containing choice values"
    )

    @field_validator("choice_map")
    @classmethod
    def validate_choice_map(cls, v):
        """Validate that choice_map is not empty."""
        if not v:
            raise ValueError("choice_map cannot be empty")
        return v

    @field_validator("choice_cols")
    @classmethod
    def validate_choice_cols_not_empty(cls, v):
        """Validate that choice_cols is not empty."""
        if not v:
            raise ValueError("choice_cols cannot be empty")
        return v

    @model_validator(mode="after")
    def validate_input_output_consistency(self):
        """Validate that choice_cols and output_cols have same length and consistency."""
        # Validate equal lengths: choice_cols[i] drives output_cols[i].
        if len(self.choice_cols) != len(self.output_cols):
            raise ValueError(
                f"choice_cols and output_cols must have same length. "
                f"Got choice_cols: {len(self.choice_cols)}, output_cols: {len(self.output_cols)}"
            )

        if isinstance(self.input_cols, list):
            # Only warn rather than fail here: the dataset itself is validated
            # strictly later in _validate_custom, where missing columns raise.
            missing_choice_cols = set(self.choice_cols) - set(self.input_cols)
            if missing_choice_cols:
                logger.warning(
                    f"Choice columns {missing_choice_cols} not found in input_cols {self.input_cols}"
                )

            missing_mapped_cols = set(self.choice_map.values()) - set(self.input_cols)
            if missing_mapped_cols:
                logger.warning(
                    f"Mapped columns {missing_mapped_cols} not found in input_cols {self.input_cols}"
                )

        return self

    def model_post_init(self, __context: Any) -> None:
        """Initialize derived attributes after Pydantic validation."""
        # Call the parent hook first, consistent with the other transform
        # blocks; previously it was skipped entirely here.
        super().model_post_init(__context)

        # Precompute choice_col -> output_col pairs for per-sample lookups.
        self.choice_to_output_map = dict(zip(self.choice_cols, self.output_cols))

    def _validate_custom(self, samples: Dataset) -> None:
        """Validate that required columns exist in the dataset.

        Parameters
        ----------
        samples : Dataset
            Input dataset to validate.

        Raises
        ------
        MissingColumnError
            If required columns are missing from the dataset.
        ValueError
            If choice values in data are not found in choice_map.
        """
        # Check that all choice_cols exist.
        missing_choice_cols = [
            col for col in self.choice_cols if col not in samples.column_names
        ]
        if missing_choice_cols:
            raise MissingColumnError(
                block_name=self.block_name,
                missing_columns=missing_choice_cols,
                available_columns=samples.column_names,
            )

        # Check that all mapped (source) columns exist.
        mapped_cols = list(self.choice_map.values())
        missing_cols = list(set(mapped_cols) - set(samples.column_names))
        if missing_cols:
            raise MissingColumnError(
                block_name=self.block_name,
                missing_columns=missing_cols,
                available_columns=samples.column_names,
            )

        # Every choice value occurring in the data must have a mapping, so
        # the per-sample lookup in _generate cannot raise KeyError.
        all_unique_choices = set()
        for choice_col in self.choice_cols:
            all_unique_choices.update(samples[choice_col])

        mapped_choices = set(self.choice_map.keys())
        unmapped_choices = all_unique_choices - mapped_choices

        if unmapped_choices:
            raise ValueError(
                f"Choice values {sorted(unmapped_choices)} not found in choice_map for block '{self.block_name}'. "
                f"Available choices in mapping: {sorted(mapped_choices)}"
            )

    def _generate(self, sample: dict[str, Any]) -> dict[str, Any]:
        """Generate a new sample by selecting values based on choice mapping.

        Parameters
        ----------
        sample : Dict[str, Any]
            Input sample to process.

        Returns
        -------
        Dict[str, Any]
            Sample with selected values stored in corresponding output columns.
        """
        for choice_col, output_col in self.choice_to_output_map.items():
            choice_value = sample[choice_col]
            source_col = self.choice_map[
                choice_value
            ]  # Safe since validated in _validate_custom
            sample[output_col] = sample[source_col]
        return sample

    def generate(self, samples: Dataset) -> Dataset:
        """Generate a new dataset with selected values.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.

        Returns
        -------
        Dataset
            Dataset with selected values stored in output column.
        """
        # Gather choice statistics for logging before the transformation.
        all_unique_choices = set()
        for choice_col in self.choice_cols:
            all_unique_choices.update(samples[choice_col])
        mapped_choices = set(self.choice_map.keys())

        logger.info(
            f"Mapping values based on choice columns for block '{self.block_name}'",
            extra={
                "block_name": self.block_name,
                "choice_columns": self.choice_cols,
                "output_columns": self.output_cols,
                "choice_mappings": len(self.choice_map),
                "unique_choices_in_data": len(all_unique_choices),
                "unmapped_choices": len(all_unique_choices - mapped_choices),
            },
        )

        # Apply the mapping row by row.
        result = samples.map(self._generate)

        # Log completion with mapping-coverage ratio (guard against empty data).
        logger.info(
            f"Successfully applied choice mapping for block '{self.block_name}'",
            extra={
                "block_name": self.block_name,
                "rows_processed": len(result),
                "output_columns": self.output_cols,
                "mapping_coverage": len(mapped_choices & all_unique_choices)
                / len(all_unique_choices)
                if all_unique_choices
                else 0,
            },
        )

        return result
@@ -0,0 +1,126 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Melt columns block for wide-to-long format transformation.
3
+
4
+ This module provides a block for transforming wide dataset format into long format
5
+ by melting specified columns into rows.
6
+ """
7
+
8
+ # Standard
9
+ from typing import Any
10
+
11
+ # Third Party
12
+ from datasets import Dataset
13
+ from pydantic import field_validator
14
+
15
+ # Local
16
+ from ...utils.error_handling import MissingColumnError
17
+ from ...utils.logger_config import setup_logger
18
+ from ..base import BaseBlock
19
+ from ..registry import BlockRegistry
20
+
21
+ logger = setup_logger(__name__)
22
+
23
+
24
@BlockRegistry.register(
    "MeltColumnsBlock",
    "transform",
    "Transforms wide dataset format into long format by melting columns into rows",
)
class MeltColumnsBlock(BaseBlock):
    """Block for flattening multiple columns into a long format.

    This block transforms a wide dataset format into a long format by melting
    specified columns into rows, creating new variable and value columns.

    The input_cols should contain the columns to be melted (variable columns).
    The output_cols must specify exactly two columns: [value_column, variable_column].
    Any other columns in the dataset will be treated as ID columns and preserved.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Union[str, List[str], Dict[str, Any], None]
        Columns to be melted into rows (variable columns).
    output_cols : Union[str, List[str], Dict[str, Any], None]
        Output column specification. Must specify exactly two columns: [value_column, variable_column].
    """

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Validate that input_cols is not empty."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        return v

    @field_validator("output_cols", mode="after")
    @classmethod
    def validate_output_cols(cls, v):
        """Validate that exactly two output columns are specified."""
        # Guard against None as well as wrong-length lists; len(None) would
        # otherwise raise an unhelpful TypeError instead of this ValueError.
        if not v or len(v) != 2:
            raise ValueError(
                f"MeltColumnsBlock expects exactly two output columns (value, variable), got {len(v) if v else 0}: {v}"
            )
        return v

    def model_post_init(self, __context: Any) -> None:
        """Initialize derived attributes after Pydantic validation."""
        # Pydantic v2's BaseModel always provides model_post_init; call it
        # directly rather than via the previous hasattr() conditional idiom.
        super().model_post_init(__context)

        # Derive value and variable column names from output_cols.
        self.value_name = self.output_cols[0]  # First output column is value
        self.var_name = self.output_cols[1]  # Second output column is variable

        # Normalize input_cols (the columns to be melted) to a list.
        self.var_cols = (
            self.input_cols if isinstance(self.input_cols, list) else [self.input_cols]
        )

    def _validate_custom(self, samples: Dataset) -> None:
        """Validate that required columns exist in the dataset.

        Parameters
        ----------
        samples : Dataset
            Input dataset to validate.

        Raises
        ------
        MissingColumnError
            If required columns are missing from the dataset.
        """
        # Check that all var_cols exist in the dataset.
        missing_cols = list(set(self.var_cols) - set(samples.column_names))
        if missing_cols:
            raise MissingColumnError(
                block_name=self.block_name,
                missing_columns=missing_cols,
                available_columns=samples.column_names,
            )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a flattened dataset in long format.

        Parameters
        ----------
        samples : Dataset
            Input dataset to flatten.

        Returns
        -------
        Dataset
            Flattened dataset in long format with new variable and value columns.
        """
        # Melt via pandas: every column not being melted is preserved as an
        # id column; melted column names land in var_name, values in value_name.
        df = samples.to_pandas()
        id_cols = [col for col in samples.column_names if col not in self.var_cols]
        flatten_df = df.melt(
            id_vars=id_cols,
            value_vars=self.var_cols,
            value_name=self.value_name,
            var_name=self.var_name,
        )
        return Dataset.from_pandas(flatten_df)
@@ -0,0 +1,69 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Rename columns block for dataset column renaming operations.
3
+
4
+ This module provides a block for renaming columns in datasets according
5
+ to a mapping specification.
6
+ """
7
+
8
+ # Standard
9
+ from typing import Any
10
+
11
+ # Third Party
12
+ from datasets import Dataset
13
+ from pydantic import field_validator
14
+
15
+ # Local
16
+ from ...utils.logger_config import setup_logger
17
+ from ..base import BaseBlock
18
+ from ..registry import BlockRegistry
19
+
20
+ logger = setup_logger(__name__)
21
+
22
+
23
@BlockRegistry.register(
    "RenameColumnsBlock",
    "transform",
    "Renames columns in a dataset according to a mapping specification",
)
class RenameColumnsBlock(BaseBlock):
    """Rename dataset columns according to a mapping.

    The mapping arrives via ``input_cols`` as a dictionary whose keys are the
    current column names and whose values are the desired new names.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Dict[str, str]
        Mapping of existing column names to their replacement names.
    """

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Ensure input_cols is a non-empty dictionary."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        if isinstance(v, dict):
            return v
        raise ValueError(
            "input_cols must be a dictionary mapping old column names to new column names"
        )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Return *samples* with its columns renamed per the configured mapping.

        Parameters
        ----------
        samples : Dataset
            Input dataset whose columns should be renamed.

        Returns
        -------
        Dataset
            A dataset identical to the input except for the renamed columns.
        """
        # Delegate the rename to the HuggingFace datasets API.
        rename_map = self.input_cols
        return samples.rename_columns(rename_map)
@@ -0,0 +1,102 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Text concatenation block for dataset column combination operations.
3
+
4
+ This module provides a block for combining multiple columns into a single column
5
+ using a specified separator.
6
+ """
7
+
8
+ # Standard
9
+ from typing import Any
10
+
11
+ # Third Party
12
+ from datasets import Dataset
13
+ from pydantic import Field, field_validator
14
+
15
+ # Local
16
+ from ...utils.logger_config import setup_logger
17
+ from ..base import BaseBlock
18
+ from ..registry import BlockRegistry
19
+
20
+ logger = setup_logger(__name__)
21
+
22
+
23
@BlockRegistry.register(
    "TextConcatBlock",
    "transform",
    "Combines multiple columns into a single column using a specified separator",
)
class TextConcatBlock(BaseBlock):
    """Block for combining multiple columns into a single column.

    This block concatenates values from multiple columns into a single output column,
    using a specified separator between values.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : list[str]
        List of column names to combine.
    output_cols : list[str]
        List containing the single output column name.
    separator : str
        String to use as separator between combined values.
    """

    separator: str = Field(
        default="\n\n", description="Separator to use between combined values"
    )

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Validate that input_cols is a non-empty list."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        if not isinstance(v, list):
            raise ValueError("input_cols must be a list of column names")
        return v

    @field_validator("output_cols", mode="after")
    @classmethod
    def validate_output_cols(cls, v):
        """Validate that exactly one output column is specified."""
        if not v or len(v) != 1:
            raise ValueError("TextConcatBlock requires exactly one output column")
        return v

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a dataset with combined columns.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.

        Returns
        -------
        Dataset
            Dataset with combined values stored in output column.

        Raises
        ------
        ValueError
            If output_cols is unset or an input column is missing from the dataset.
        """
        # Defensive check; the output_cols validator already guarantees this.
        if not self.output_cols:
            raise ValueError("output_cols must be specified")

        output_col = self.output_cols[0]

        # Validate required columns once up front, instead of re-checking them
        # for every row inside Dataset.map().
        for col in self.input_cols:
            if col not in samples.column_names:
                raise ValueError(f"Input column '{col}' not found in sample")

        # Bind loop invariants locally for the per-row closure.
        separator = self.separator
        input_cols = self.input_cols

        def _combine_columns(sample):
            """Join the configured columns' values with the separator."""
            sample[output_col] = separator.join(str(sample[col]) for col in input_cols)
            return sample

        # Apply the combination to all samples.
        return samples.map(_combine_columns)