sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +27 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +1209 -0
- sdg_hub/core/flow/checkpointer.py +333 -0
- sdg_hub/core/flow/metadata.py +389 -0
- sdg_hub/core/flow/migration.py +198 -0
- sdg_hub/core/flow/registry.py +393 -0
- sdg_hub/core/flow/validation.py +277 -0
- sdg_hub/{utils → core/utils}/__init__.py +7 -4
- sdg_hub/core/utils/datautils.py +63 -0
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/core/utils/flow_id_words.yaml +231 -0
- sdg_hub/core/utils/flow_identifier.py +94 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/core/utils/yaml_utils.py +59 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.1.dist-info/METADATA +221 -0
- sdg_hub-0.2.1.dist-info/RECORD +68 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -80
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/datautils.py +0 -14
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.4.dist-info/METADATA +0 -190
- sdg_hub-0.1.4.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
"""Data transformation blocks for dataset manipulation.

This module provides blocks for transforming datasets including column operations,
wide-to-long transformations, value selection, and majority value assignment.
"""

# Local
# Re-export each transform block so callers can import everything from
# ``...blocks.transform`` without knowing the individual module layout.
from .duplicate_columns import DuplicateColumnsBlock
from .index_based_mapper import IndexBasedMapperBlock
from .melt_columns import MeltColumnsBlock
from .rename_columns import RenameColumnsBlock
from .text_concat import TextConcatBlock
from .uniform_col_val_setter import UniformColumnValueSetter

# Explicit public API of this subpackage (also controls `from ... import *`).
__all__ = [
    "TextConcatBlock",
    "DuplicateColumnsBlock",
    "MeltColumnsBlock",
    "IndexBasedMapperBlock",
    "RenameColumnsBlock",
    "UniformColumnValueSetter",
]
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Duplicate columns block for dataset column duplication operations.
|
3
|
+
|
4
|
+
This module provides a block for duplicating existing columns with new names
|
5
|
+
according to a mapping specification.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import field_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.logger_config import setup_logger
|
17
|
+
from ..base import BaseBlock
|
18
|
+
from ..registry import BlockRegistry
|
19
|
+
|
20
|
+
logger = setup_logger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
@BlockRegistry.register(
    "DuplicateColumnsBlock",
    "transform",
    "Duplicates existing columns with new names according to a mapping specification",
)
class DuplicateColumnsBlock(BaseBlock):
    """Block for duplicating existing columns with new names.

    This block creates copies of existing columns with new names according to a
    mapping specification. The mapping is provided through ``input_cols`` as a
    dictionary.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Dict[str, str]
        Dictionary mapping existing column names to new column names.
        Keys are existing column names, values are new column names.
    """

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Validate that input_cols is a non-empty dict.

        The emptiness check runs first so that ``None``/``{}``/``[]`` all
        report "cannot be empty" rather than the type error.
        """
        if not v:
            raise ValueError("input_cols cannot be empty")
        if not isinstance(v, dict):
            raise ValueError(
                "input_cols must be a dictionary mapping existing column names to new column names"
            )
        return v

    def model_post_init(self, __context: Any) -> None:
        """Initialize derived attributes after Pydantic validation.

        Defaults ``output_cols`` to the new (target) column names so
        downstream validation knows which columns this block produces.
        """
        # Pydantic v2's BaseModel always defines model_post_init, so call it
        # directly; the previous hasattr-guarded conditional expression was a
        # dead guard abused as a statement.
        super().model_post_init(__context)

        # Set output_cols to the new column names being created
        if self.output_cols is None:
            self.output_cols = list(self.input_cols.values())

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a dataset with duplicated columns.

        Parameters
        ----------
        samples : Dataset
            Input dataset to duplicate columns from.
        **kwargs : Any
            Unused; accepted for interface compatibility with other blocks.

        Returns
        -------
        Dataset
            Dataset with additional duplicated columns.

        Raises
        ------
        ValueError
            If a source column named in the mapping is missing from the dataset.
        """
        # Dataset.add_column returns a new dataset each time, so the input
        # `samples` is never mutated (the old "create a copy" comment was
        # misleading: no copy is made on this line).
        result = samples

        # Duplicate each column as specified in the mapping
        for source_col, target_col in self.input_cols.items():
            if source_col not in result.column_names:
                raise ValueError(f"Source column '{source_col}' not found in dataset")

            result = result.add_column(target_col, result[source_col])

        return result
|
@@ -0,0 +1,225 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Selector block for column value selection and mapping.
|
3
|
+
|
4
|
+
This module provides a block for selecting and mapping values from one column
|
5
|
+
to another based on a choice column's value.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import Field, field_validator, model_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.error_handling import MissingColumnError
|
17
|
+
from ...utils.logger_config import setup_logger
|
18
|
+
from ..base import BaseBlock
|
19
|
+
from ..registry import BlockRegistry
|
20
|
+
|
21
|
+
logger = setup_logger(__name__)
|
22
|
+
|
23
|
+
|
24
|
+
@BlockRegistry.register(
    "IndexBasedMapperBlock",
    "transform",
    "Maps values from source columns to output columns based on choice columns using shared mapping",
)
class IndexBasedMapperBlock(BaseBlock):
    """Block for mapping values from source columns to output columns based on choice columns.

    This block uses a shared mapping dictionary to select values from source columns and
    store them in output columns based on corresponding choice columns' values.
    The choice_cols and output_cols must have the same length - choice_cols[i] determines
    the value for output_cols[i].

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Union[str, List[str], Dict[str, Any], None]
        Input column specification. Should include choice columns and mapped columns.
    output_cols : Union[str, List[str], Dict[str, Any], None]
        Output column specification. Must have same length as choice_cols.
    choice_map : Dict[str, str]
        Dictionary mapping choice values to source column names.
    choice_cols : List[str]
        List of column names containing choice values. Must have same length as output_cols.
    """

    choice_map: dict[str, str] = Field(
        ..., description="Dictionary mapping choice values to column names"
    )
    choice_cols: list[str] = Field(
        ..., description="List of column names containing choice values"
    )

    @field_validator("choice_map")
    @classmethod
    def validate_choice_map(cls, v):
        """Validate that choice_map is not empty."""
        if not v:
            raise ValueError("choice_map cannot be empty")
        return v

    @field_validator("choice_cols")
    @classmethod
    def validate_choice_cols_not_empty(cls, v):
        """Validate that choice_cols is not empty."""
        if not v:
            raise ValueError("choice_cols cannot be empty")
        return v

    @model_validator(mode="after")
    def validate_input_output_consistency(self):
        """Validate that choice_cols and output_cols have same length and consistency."""
        # Validate equal lengths
        if len(self.choice_cols) != len(self.output_cols):
            raise ValueError(
                f"choice_cols and output_cols must have same length. "
                f"Got choice_cols: {len(self.choice_cols)}, output_cols: {len(self.output_cols)}"
            )

        if isinstance(self.input_cols, list):
            # Only warn (don't fail) here: the authoritative dataset-level
            # column checks happen later in _validate_custom.
            missing_choice_cols = set(self.choice_cols) - set(self.input_cols)
            if missing_choice_cols:
                logger.warning(
                    f"Choice columns {missing_choice_cols} not found in input_cols {self.input_cols}"
                )

            # Check that all mapped columns are in input_cols
            missing_mapped_cols = set(self.choice_map.values()) - set(self.input_cols)
            if missing_mapped_cols:
                logger.warning(
                    f"Mapped columns {missing_mapped_cols} not found in input_cols {self.input_cols}"
                )

        return self

    def model_post_init(self, __context: Any) -> None:
        """Initialize derived attributes after Pydantic validation."""
        # Call the base implementation first, matching the other transform
        # blocks; the original skipped it and silently bypassed any
        # base-class post-init logic.
        super().model_post_init(__context)

        # Create mapping from choice_col to output_col for easy access
        self.choice_to_output_map = dict(zip(self.choice_cols, self.output_cols))

    def _validate_custom(self, samples: Dataset) -> None:
        """Validate that required columns exist in the dataset.

        Parameters
        ----------
        samples : Dataset
            Input dataset to validate.

        Raises
        ------
        MissingColumnError
            If required columns are missing from the dataset.
        ValueError
            If choice values in data are not found in choice_map.
        """
        # Check that all choice_cols exist
        missing_choice_cols = [
            col for col in self.choice_cols if col not in samples.column_names
        ]
        if missing_choice_cols:
            raise MissingColumnError(
                block_name=self.block_name,
                missing_columns=missing_choice_cols,
                available_columns=samples.column_names,
            )

        # Check that all mapped columns exist
        mapped_cols = list(self.choice_map.values())
        missing_cols = list(set(mapped_cols) - set(samples.column_names))
        if missing_cols:
            raise MissingColumnError(
                block_name=self.block_name,
                missing_columns=missing_cols,
                available_columns=samples.column_names,
            )

        # Check that all choice values in all choice columns have corresponding mappings
        all_unique_choices = set()
        for choice_col in self.choice_cols:
            all_unique_choices.update(samples[choice_col])

        mapped_choices = set(self.choice_map.keys())
        unmapped_choices = all_unique_choices - mapped_choices

        if unmapped_choices:
            raise ValueError(
                f"Choice values {sorted(unmapped_choices)} not found in choice_map for block '{self.block_name}'. "
                f"Available choices in mapping: {sorted(mapped_choices)}"
            )

    def _generate(self, sample: dict[str, Any]) -> dict[str, Any]:
        """Generate a new sample by selecting values based on choice mapping.

        Parameters
        ----------
        sample : Dict[str, Any]
            Input sample to process.

        Returns
        -------
        Dict[str, Any]
            Sample with selected values stored in corresponding output columns.
        """
        for choice_col, output_col in self.choice_to_output_map.items():
            choice_value = sample[choice_col]
            # Direct lookup is safe: unmapped choice values were rejected in
            # _validate_custom.
            source_col = self.choice_map[choice_value]
            sample[output_col] = sample[source_col]
        return sample

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a new dataset with selected values.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.
        **kwargs : Any
            Unused; accepted for interface compatibility with other blocks
            (the original signature omitted it, unlike every sibling block).

        Returns
        -------
        Dataset
            Dataset with selected values stored in output column.
        """
        # Log the operation
        all_unique_choices = set()
        for choice_col in self.choice_cols:
            all_unique_choices.update(samples[choice_col])
        mapped_choices = set(self.choice_map.keys())

        logger.info(
            f"Mapping values based on choice columns for block '{self.block_name}'",
            extra={
                "block_name": self.block_name,
                "choice_columns": self.choice_cols,
                "output_columns": self.output_cols,
                "choice_mappings": len(self.choice_map),
                "unique_choices_in_data": len(all_unique_choices),
                "unmapped_choices": len(all_unique_choices - mapped_choices),
            },
        )

        # Apply the mapping
        result = samples.map(self._generate)

        # Log completion
        logger.info(
            f"Successfully applied choice mapping for block '{self.block_name}'",
            extra={
                "block_name": self.block_name,
                "rows_processed": len(result),
                "output_columns": self.output_cols,
                "mapping_coverage": len(mapped_choices & all_unique_choices)
                / len(all_unique_choices)
                if all_unique_choices
                else 0,
            },
        )

        return result
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Melt columns block for wide-to-long format transformation.
|
3
|
+
|
4
|
+
This module provides a block for transforming wide dataset format into long format
|
5
|
+
by melting specified columns into rows.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import field_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.error_handling import MissingColumnError
|
17
|
+
from ...utils.logger_config import setup_logger
|
18
|
+
from ..base import BaseBlock
|
19
|
+
from ..registry import BlockRegistry
|
20
|
+
|
21
|
+
logger = setup_logger(__name__)
|
22
|
+
|
23
|
+
|
24
|
+
@BlockRegistry.register(
    "MeltColumnsBlock",
    "transform",
    "Transforms wide dataset format into long format by melting columns into rows",
)
class MeltColumnsBlock(BaseBlock):
    """Block for flattening multiple columns into a long format.

    This block transforms a wide dataset format into a long format by melting
    specified columns into rows, creating new variable and value columns.

    The input_cols should contain the columns to be melted (variable columns).
    The output_cols must specify exactly two columns: [value_column, variable_column].
    Any other columns in the dataset will be treated as ID columns and preserved.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Union[str, List[str], Dict[str, Any], None]
        Columns to be melted into rows (variable columns).
    output_cols : Union[str, List[str], Dict[str, Any], None]
        Output column specification. Must specify exactly two columns: [value_column, variable_column].
    """

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Validate that input_cols is not empty."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        return v

    @field_validator("output_cols", mode="after")
    @classmethod
    def validate_output_cols(cls, v):
        """Validate that exactly two output columns are specified."""
        # Guard against None so a missing spec raises the intended ValueError
        # instead of `TypeError: object of type 'NoneType' has no len()`.
        n_cols = 0 if v is None else len(v)
        if n_cols != 2:
            raise ValueError(
                f"MeltColumnsBlock expects exactly two output columns (value, variable), got {n_cols}: {v}"
            )
        return v

    def model_post_init(self, __context: Any) -> None:
        """Initialize derived attributes after Pydantic validation."""
        # Pydantic v2's BaseModel always defines model_post_init; call it
        # directly instead of the previous hasattr-guarded conditional
        # expression (a dead guard abused as a statement).
        super().model_post_init(__context)

        # Derive value and variable column names from output_cols
        self.value_name = self.output_cols[0]  # First output column is value
        self.var_name = self.output_cols[1]  # Second output column is variable

        # input_cols contains the columns to be melted (what was var_cols)
        self.var_cols = (
            self.input_cols if isinstance(self.input_cols, list) else [self.input_cols]
        )

    def _validate_custom(self, samples: Dataset) -> None:
        """Validate that required columns exist in the dataset.

        Parameters
        ----------
        samples : Dataset
            Input dataset to validate.

        Raises
        ------
        MissingColumnError
            If required columns are missing from the dataset.
        """
        # Check that all var_cols exist in the dataset
        missing_cols = list(set(self.var_cols) - set(samples.column_names))
        if missing_cols:
            raise MissingColumnError(
                block_name=self.block_name,
                missing_columns=missing_cols,
                available_columns=samples.column_names,
            )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a flattened dataset in long format.

        Parameters
        ----------
        samples : Dataset
            Input dataset to flatten.
        **kwargs : Any
            Unused; accepted for interface compatibility with other blocks.

        Returns
        -------
        Dataset
            Flattened dataset in long format with new variable and value columns.
        """
        # Round-trip through pandas: DataFrame.melt does the actual
        # wide-to-long reshape; every non-melted column is kept as an ID.
        df = samples.to_pandas()
        id_cols = [col for col in samples.column_names if col not in self.var_cols]
        flatten_df = df.melt(
            id_vars=id_cols,
            value_vars=self.var_cols,
            value_name=self.value_name,
            var_name=self.var_name,
        )
        return Dataset.from_pandas(flatten_df)
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Rename columns block for dataset column renaming operations.
|
3
|
+
|
4
|
+
This module provides a block for renaming columns in datasets according
|
5
|
+
to a mapping specification.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import field_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.logger_config import setup_logger
|
17
|
+
from ..base import BaseBlock
|
18
|
+
from ..registry import BlockRegistry
|
19
|
+
|
20
|
+
logger = setup_logger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
@BlockRegistry.register(
    "RenameColumnsBlock",
    "transform",
    "Renames columns in a dataset according to a mapping specification",
)
class RenameColumnsBlock(BaseBlock):
    """Rename dataset columns using a mapping supplied via ``input_cols``.

    The ``input_cols`` dictionary maps each current column name (key) to its
    desired new name (value); all other columns pass through untouched.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Dict[str, str]
        Mapping of existing column names to their replacement names.
    """

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Reject empty or non-dict rename mappings."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        if isinstance(v, dict):
            return v
        raise ValueError(
            "input_cols must be a dictionary mapping old column names to new column names"
        )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Return ``samples`` with columns renamed per ``input_cols``.

        Parameters
        ----------
        samples : Dataset
            Dataset whose columns should be renamed.

        Returns
        -------
        Dataset
            Dataset with the columns renamed.
        """
        # Delegate directly to the HuggingFace datasets built-in.
        rename_map = self.input_cols
        return samples.rename_columns(rename_map)
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Text concatenation block for dataset column combination operations.
|
3
|
+
|
4
|
+
This module provides a block for combining multiple columns into a single column
|
5
|
+
using a specified separator.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import Field, field_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.logger_config import setup_logger
|
17
|
+
from ..base import BaseBlock
|
18
|
+
from ..registry import BlockRegistry
|
19
|
+
|
20
|
+
logger = setup_logger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
@BlockRegistry.register(
    "TextConcatBlock",
    "transform",
    "Combines multiple columns into a single column using a specified separator",
)
class TextConcatBlock(BaseBlock):
    """Block for combining multiple columns into a single column.

    This block concatenates values from multiple columns into a single output column,
    using a specified separator between values.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : list[str]
        List of column names to combine.
    output_cols : list[str]
        List containing the single output column name.
    separator : str
        String to use as separator between combined values.
    """

    separator: str = Field(
        default="\n\n", description="Separator to use between combined values"
    )

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Validate that input_cols is a non-empty list."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        if not isinstance(v, list):
            raise ValueError("input_cols must be a list of column names")
        return v

    @field_validator("output_cols", mode="after")
    @classmethod
    def validate_output_cols(cls, v):
        """Validate that exactly one output column is specified."""
        if not v or len(v) != 1:
            raise ValueError("TextConcatBlock requires exactly one output column")
        return v

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a dataset with combined columns.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.
        **kwargs : Any
            Unused; accepted for interface compatibility with other blocks.

        Returns
        -------
        Dataset
            Dataset with combined values stored in output column.

        Raises
        ------
        ValueError
            If an input column is missing from the dataset.
        """
        if not self.output_cols:
            raise ValueError("output_cols must be specified")

        output_col = self.output_cols[0]

        # Validate once up front against the dataset schema; columns are
        # uniform across rows, so the original per-row check inside map()
        # was redundant loop-invariant work.
        for col in self.input_cols:
            if col not in samples.column_names:
                raise ValueError(f"Input column '{col}' not found in sample")

        def _combine_columns(sample):
            """Combine values from input columns using the separator."""
            sample[output_col] = self.separator.join(
                str(sample[col]) for col in self.input_cols
            )
            return sample

        # Apply the combination to all samples
        result = samples.map(_combine_columns)
        return result
|