sdg-hub 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +25 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +980 -0
- sdg_hub/core/flow/metadata.py +344 -0
- sdg_hub/core/flow/migration.py +187 -0
- sdg_hub/core/flow/registry.py +330 -0
- sdg_hub/core/flow/validation.py +265 -0
- sdg_hub/{utils → core/utils}/__init__.py +6 -4
- sdg_hub/{utils → core/utils}/datautils.py +1 -3
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.0.dist-info/METADATA +218 -0
- sdg_hub-0.2.0.dist-info/RECORD +63 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -80
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.4.dist-info/METADATA +0 -190
- sdg_hub-0.1.4.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,126 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Melt columns block for wide-to-long format transformation.
|
3
|
+
|
4
|
+
This module provides a block for transforming wide dataset format into long format
|
5
|
+
by melting specified columns into rows.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import field_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.error_handling import MissingColumnError
|
17
|
+
from ...utils.logger_config import setup_logger
|
18
|
+
from ..base import BaseBlock
|
19
|
+
from ..registry import BlockRegistry
|
20
|
+
|
21
|
+
logger = setup_logger(__name__)
|
22
|
+
|
23
|
+
|
24
|
+
@BlockRegistry.register(
    "MeltColumnsBlock",
    "transform",
    "Transforms wide dataset format into long format by melting columns into rows",
)
class MeltColumnsBlock(BaseBlock):
    """Block for flattening multiple columns into a long format.

    This block transforms a wide dataset format into a long format by melting
    specified columns into rows, creating new variable and value columns.

    The input_cols should contain the columns to be melted (variable columns).
    The output_cols must specify exactly two columns: [value_column, variable_column].
    Any other columns in the dataset will be treated as ID columns and preserved.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Union[str, List[str], Dict[str, Any], None]
        Columns to be melted into rows (variable columns).
    output_cols : Union[str, List[str], Dict[str, Any], None]
        Output column specification. Must specify exactly two columns: [value_column, variable_column].
    """

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Validate that input_cols is not empty."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        return v

    @field_validator("output_cols", mode="after")
    @classmethod
    def validate_output_cols(cls, v):
        """Validate that exactly two output columns are specified.

        Guards against ``None`` explicitly (the declared type admits it) so
        callers get a clear ``ValueError`` rather than a ``TypeError`` from
        ``len(None)``.
        """
        if v is None or len(v) != 2:
            count = 0 if v is None else len(v)
            raise ValueError(
                f"MeltColumnsBlock expects exactly two output columns (value, variable), got {count}: {v}"
            )
        return v

    def model_post_init(self, __context: Any) -> None:
        """Initialize derived attributes after Pydantic validation."""
        # Plain statement instead of a conditional expression used only for
        # its side effect; behavior is identical.
        if hasattr(super(), "model_post_init"):
            super().model_post_init(__context)

        # Derive value and variable column names from output_cols
        self.value_name = self.output_cols[0]  # First output column is value
        self.var_name = self.output_cols[1]  # Second output column is variable

        # input_cols contains the columns to be melted (what was var_cols)
        self.var_cols = (
            self.input_cols if isinstance(self.input_cols, list) else [self.input_cols]
        )

    def _validate_custom(self, samples: Dataset) -> None:
        """Validate that required columns exist in the dataset.

        Parameters
        ----------
        samples : Dataset
            Input dataset to validate.

        Raises
        ------
        MissingColumnError
            If required columns are missing from the dataset.
        """
        # Check that all var_cols exist in the dataset
        missing_cols = list(set(self.var_cols) - set(samples.column_names))
        if missing_cols:
            raise MissingColumnError(
                block_name=self.block_name,
                missing_columns=missing_cols,
                available_columns=samples.column_names,
            )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a flattened dataset in long format.

        Parameters
        ----------
        samples : Dataset
            Input dataset to flatten.

        Returns
        -------
        Dataset
            Flattened dataset in long format with new variable and value columns.
        """
        df = samples.to_pandas()
        # Every column that is not being melted is preserved as an ID column.
        id_cols = [col for col in samples.column_names if col not in self.var_cols]
        flatten_df = df.melt(
            id_vars=id_cols,
            value_vars=self.var_cols,
            value_name=self.value_name,
            var_name=self.var_name,
        )
        return Dataset.from_pandas(flatten_df)
@@ -0,0 +1,69 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Rename columns block for dataset column renaming operations.
|
3
|
+
|
4
|
+
This module provides a block for renaming columns in datasets according
|
5
|
+
to a mapping specification.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import field_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.logger_config import setup_logger
|
17
|
+
from ..base import BaseBlock
|
18
|
+
from ..registry import BlockRegistry
|
19
|
+
|
20
|
+
logger = setup_logger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
@BlockRegistry.register(
    "RenameColumnsBlock",
    "transform",
    "Renames columns in a dataset according to a mapping specification",
)
class RenameColumnsBlock(BaseBlock):
    """Block that renames dataset columns via a mapping.

    The mapping is supplied through ``input_cols`` as a dictionary whose keys
    are the existing column names and whose values are the new names.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Dict[str, str]
        Mapping of existing column names to their replacement names.
    """

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Ensure the column mapping is a populated dictionary."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        if not isinstance(v, dict):
            raise ValueError(
                "input_cols must be a dictionary mapping old column names to new column names"
            )
        return v

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Return the dataset with its columns renamed.

        Parameters
        ----------
        samples : Dataset
            Dataset whose columns should be renamed.

        Returns
        -------
        Dataset
            Dataset with the columns renamed per ``input_cols``.
        """
        mapping = self.input_cols
        # Delegate the rename to the HuggingFace datasets built-in.
        return samples.rename_columns(mapping)
@@ -0,0 +1,102 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Text concatenation block for dataset column combination operations.
|
3
|
+
|
4
|
+
This module provides a block for combining multiple columns into a single column
|
5
|
+
using a specified separator.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import Field, field_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.logger_config import setup_logger
|
17
|
+
from ..base import BaseBlock
|
18
|
+
from ..registry import BlockRegistry
|
19
|
+
|
20
|
+
logger = setup_logger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
@BlockRegistry.register(
    "TextConcatBlock",
    "transform",
    "Combines multiple columns into a single column using a specified separator",
)
class TextConcatBlock(BaseBlock):
    """Block for combining multiple columns into a single column.

    This block concatenates values from multiple columns into a single output
    column, using a specified separator between values.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : list[str]
        List of column names to combine.
    output_cols : list[str]
        List containing the single output column name.
    separator : str
        String to use as separator between combined values.
    """

    separator: str = Field(
        default="\n\n", description="Separator to use between combined values"
    )

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols(cls, v):
        """Validate that input_cols is a non-empty list."""
        if not v:
            raise ValueError("input_cols cannot be empty")
        if not isinstance(v, list):
            raise ValueError("input_cols must be a list of column names")
        return v

    @field_validator("output_cols", mode="after")
    @classmethod
    def validate_output_cols(cls, v):
        """Validate that exactly one output column is specified."""
        if not v or len(v) != 1:
            raise ValueError("TextConcatBlock requires exactly one output column")
        return v

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a dataset with combined columns.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.

        Returns
        -------
        Dataset
            Dataset with combined values stored in output column.

        Raises
        ------
        ValueError
            If output_cols is missing or an input column does not exist.
        """
        # Defensive guard; the field validator normally enforces this.
        if not self.output_cols:
            raise ValueError("output_cols must be specified")

        output_col = self.output_cols[0]

        # All rows of a Dataset share one schema, so validate the input
        # columns once up front instead of re-checking on every map() call.
        for col in self.input_cols:
            if col not in samples.column_names:
                raise ValueError(f"Input column '{col}' not found in sample")

        def _combine_columns(sample):
            """Join the input column values with the configured separator."""
            sample[output_col] = self.separator.join(
                str(sample[col]) for col in self.input_cols
            )
            return sample

        # Apply the combination to all samples
        return samples.map(_combine_columns)
@@ -0,0 +1,101 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Uniform column value setter block for replacing a column with a single statistic.
|
3
|
+
|
4
|
+
This block sets all values in a column to a single summary statistic:
|
5
|
+
mode, min, max, mean, or median.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any, Literal
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import field_validator
|
14
|
+
import numpy as np
|
15
|
+
|
16
|
+
# Local
|
17
|
+
from ...utils.logger_config import setup_logger
|
18
|
+
from ..base import BaseBlock
|
19
|
+
from ..registry import BlockRegistry
|
20
|
+
|
21
|
+
logger = setup_logger(__name__)
|
22
|
+
|
23
|
+
|
24
|
+
@BlockRegistry.register(
    "UniformColumnValueSetter",
    "transform",
    "Replaces all values in a column with a single summary statistic (e.g., mode, mean, median)",
)
class UniformColumnValueSetter(BaseBlock):
    """Block that replaces all values in a column with a single aggregate value.

    Supported strategies include: mode, min, max, mean, median.

    Attributes
    ----------
    block_name : str
        Name of the block.
    input_cols : Union[str, List[str]]
        Must specify exactly one input column.
    output_cols : Union[str, List[str]]
        Output column list. Ignored — modifies in place.
    reduction_strategy : Literal["mode", "min", "max", "mean", "median"]
        Strategy used to compute the replacement value.
    """

    reduction_strategy: Literal["mode", "min", "max", "mean", "median"] = "mode"

    @field_validator("input_cols", mode="after")
    @classmethod
    def validate_input_cols_single(cls, v):
        """Validate that exactly one input column is specified."""
        if not v or len(v) != 1:
            raise ValueError(
                "UniformColumnValueSetter requires exactly one input column"
            )
        return v

    def model_post_init(self, __context: Any) -> None:
        """Initialize derived attributes after Pydantic validation."""
        if hasattr(super(), "model_post_init"):
            super().model_post_init(__context)

        # This block mutates its single input column in place, so any
        # requested output columns are dropped with a warning.
        if self.output_cols and len(self.output_cols) > 0:
            logger.warning(
                f"UniformColumnValueSetter modifies columns in-place. "
                f"Specified output_cols {self.output_cols} will be ignored."
            )
            self.output_cols = []
        self.col_name = self.input_cols[0]

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Replace every value in the target column with a summary statistic.

        Parameters
        ----------
        samples : Dataset
            Input dataset to transform.

        Returns
        -------
        Dataset
            Dataset with the target column uniformly set to the computed value.

        Raises
        ------
        ValueError
            If the dataset is empty, the strategy is unsupported, or the
            reduction could not be computed (e.g., an all-NaN column).
        """
        df = samples.to_pandas()

        if df.empty:
            raise ValueError("Cannot compute reduction for empty dataset")

        col = df[self.col_name]

        strategy = self.reduction_strategy
        if strategy == "mode":
            # Compute the mode Series only once (previously computed twice).
            modes = col.mode()
            value = modes.iloc[0] if not modes.empty else None
        elif strategy == "min":
            value = col.min()
        elif strategy == "max":
            value = col.max()
        elif strategy == "mean":
            value = col.mean()
        elif strategy == "median":
            value = col.median()
        else:
            # Unreachable when constructed through Pydantic (Literal field),
            # kept as a defensive guard for direct attribute assignment.
            raise ValueError(f"Unsupported reduction strategy: {strategy}")

        if value is None or (isinstance(value, float) and np.isnan(value)):
            raise ValueError(
                f"Could not compute {strategy} for column '{self.col_name}'"
            )

        logger.info(
            f"Replacing all values in column '{self.col_name}' with {strategy} value: '{value}'"
        )

        df[self.col_name] = value
        return Dataset.from_pandas(df)
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""New flow implementation for SDG Hub.
|
3
|
+
|
4
|
+
This module provides a redesigned Flow class with metadata support,
|
5
|
+
dual initialization modes, and runtime parameter overrides.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Local
|
9
|
+
from .base import Flow
|
10
|
+
from .metadata import FlowMetadata, FlowParameter
|
11
|
+
from .registry import FlowRegistry
|
12
|
+
from .validation import FlowValidator
|
13
|
+
|
14
|
+
__all__ = [
|
15
|
+
"Flow",
|
16
|
+
"FlowMetadata",
|
17
|
+
"FlowParameter",
|
18
|
+
"FlowRegistry",
|
19
|
+
"FlowValidator",
|
20
|
+
]
|