sdg-hub 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +25 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +980 -0
- sdg_hub/core/flow/metadata.py +344 -0
- sdg_hub/core/flow/migration.py +187 -0
- sdg_hub/core/flow/registry.py +330 -0
- sdg_hub/core/flow/validation.py +265 -0
- sdg_hub/{utils → core/utils}/__init__.py +6 -4
- sdg_hub/{utils → core/utils}/datautils.py +1 -3
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.0.dist-info/METADATA +218 -0
- sdg_hub-0.2.0.dist-info/RECORD +63 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -80
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.4.dist-info/METADATA +0 -190
- sdg_hub-0.1.4.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
sdg_hub/__init__.py
CHANGED
```diff
@@ -1,3 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
+"""SDG Hub - Synthetic Data Generation Framework."""
+
 # Local
-
+# Local
+from .core import (
+    BaseBlock,
+    BlockRegistry,
+    Flow,
+    FlowMetadata,
+    FlowParameter,
+    FlowRegistry,
+    FlowValidator,
+    GenerateError,
+    resolve_path,
+)
+
+__all__ = [
+    # Core framework classes (top-level access)
+    "BaseBlock",
+    "BlockRegistry",
+    "Flow",
+    "FlowRegistry",
+    # Metadata and utilities
+    "FlowMetadata",
+    "FlowParameter",
+    "FlowValidator",
+    "GenerateError",
+    "resolve_path",
+]
```
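The top-level package now re-exports the core framework classes directly. A minimal orientation sketch (not taken from the diff) of what that surface looks like to a consumer, using only the names listed in `__all__` above:

```python
# Quick orientation sketch; assumes sdg-hub 0.2.0 is installed.
# Every name used here comes from the new top-level __all__.
from sdg_hub import BaseBlock, BlockRegistry, Flow, FlowRegistry, resolve_path

print(BaseBlock.__name__, Flow.__name__)              # core classes at the package root
print(BlockRegistry.__name__, FlowRegistry.__name__)  # registries are exported too
print(callable(resolve_path))                         # utility helper at the root
```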
sdg_hub/_version.py
CHANGED
sdg_hub/core/__init__.py
ADDED
@@ -0,0 +1,22 @@
```python
# SPDX-License-Identifier: Apache-2.0
"""Core SDG Hub components."""

# Local
from .blocks import BaseBlock, BlockRegistry
from .flow import Flow, FlowMetadata, FlowParameter, FlowRegistry, FlowValidator
from .utils import GenerateError, resolve_path

__all__ = [
    # Block components
    "BaseBlock",
    "BlockRegistry",
    # Flow components
    "Flow",
    "FlowRegistry",
    "FlowMetadata",
    "FlowParameter",
    "FlowValidator",
    # Utils
    "GenerateError",
    "resolve_path",
]
```
sdg_hub/core/blocks/__init__.py
ADDED
@@ -0,0 +1,58 @@
```python
"""Block implementations for SDG Hub.

This package provides various block implementations for data generation, processing, and transformation.
"""

# Local
from .base import BaseBlock
from .deprecated_blocks import (
    CombineColumnsBlock,
    DuplicateColumns,
    FilterByValueBlock,
    FlattenColumnsBlock,
    LLMBlock,
    RenameColumns,
    SamplePopulatorBlock,
    SelectorBlock,
    SetToMajorityValue,
)
from .evaluation import EvaluateFaithfulnessBlock, EvaluateRelevancyBlock
from .filtering import ColumnValueFilterBlock
from .llm import LLMChatBlock, PromptBuilderBlock, TextParserBlock
from .registry import BlockRegistry
from .transform import (
    DuplicateColumnsBlock,
    IndexBasedMapperBlock,
    MeltColumnsBlock,
    RenameColumnsBlock,
    TextConcatBlock,
    UniformColumnValueSetter,
)

# All blocks moved to deprecated_blocks or transform modules

__all__ = [
    "BaseBlock",
    "BlockRegistry",
    "ColumnValueFilterBlock",
    "DuplicateColumnsBlock",
    "IndexBasedMapperBlock",
    "MeltColumnsBlock",
    "RenameColumnsBlock",
    "TextConcatBlock",
    "UniformColumnValueSetter",
    "CombineColumnsBlock",  # Deprecated
    "DuplicateColumns",  # Deprecated
    "FilterByValueBlock",  # Deprecated
    "FlattenColumnsBlock",  # Deprecated
    "RenameColumns",  # Deprecated
    "SamplePopulatorBlock",  # Deprecated
    "SelectorBlock",  # Deprecated
    "SetToMajorityValue",  # Deprecated
    "LLMBlock",  # Deprecated
    "LLMChatBlock",
    "TextParserBlock",
    "PromptBuilderBlock",
    "EvaluateFaithfulnessBlock",
    "EvaluateRelevancyBlock",
]
```
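Both the new-style blocks and the deprecated shims are re-exported from this single namespace, so existing imports keep working while new code targets the replacements. A small illustrative sketch (not part of the diff):

```python
# Sketch only: new blocks and their deprecated counterparts import from one place.
from sdg_hub.core.blocks import (
    LLMChatBlock,         # new-style LLM block
    TextConcatBlock,      # new-style transform block
    LLMBlock,             # deprecated shim kept for old flows
    CombineColumnsBlock,  # deprecated shim kept for old flows
)

print([cls.__name__ for cls in (LLMChatBlock, TextConcatBlock, LLMBlock, CombineColumnsBlock)])
```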
sdg_hub/core/blocks/base.py
ADDED
@@ -0,0 +1,313 @@
```python
# SPDX-License-Identifier: Apache-2.0
"""Enhanced base block implementation with standardized patterns.

This module provides a comprehensive base class for all blocks in the system,
with unified constructor patterns, column handling, and common functionality.
"""

# Standard
from abc import ABC, abstractmethod
from typing import Any, Optional, Union

# Third Party
from datasets import Dataset
from pydantic import BaseModel, ConfigDict, Field, field_validator
from rich.console import Console
from rich.panel import Panel
from rich.text import Text

# Local
from ..utils.error_handling import (
    EmptyDatasetError,
    MissingColumnError,
    OutputColumnCollisionError,
)
from ..utils.logger_config import setup_logger

logger = setup_logger(__name__)
console = Console()


class BaseBlock(BaseModel, ABC):
    """Base class for all blocks, with standardized patterns and full Pydantic compatibility.

    This class defines a unified, configurable base for building composable data processing blocks
    that operate over HuggingFace Datasets. It supports field-based initialization, validation,
    and rich logging for inputs and outputs.

    Attributes
    ----------
    block_name : str
        Unique identifier for this block instance.
    input_cols : Union[List[str], Dict[str, Any]]
        Input columns from the dataset (string, list of strings, or mapping).
    output_cols : Union[List[str], Dict[str, Any]]
        Output columns to write to the dataset (string, list of strings, or mapping).
    """

    block_name: str = Field(
        ..., description="Unique identifier for this block instance"
    )
    input_cols: Union[str, list[str], dict[str, Any], None] = Field(
        None, description="Input columns: str, list, or dict"
    )
    output_cols: Union[str, list[str], dict[str, Any], None] = Field(
        None, description="Output columns: str, list, or dict"
    )

    # Allow extra config fields and complex types like Dataset
    model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)

    # Normalize input columns before model construction
    @field_validator("input_cols", mode="before")
    @classmethod
    def normalize_input_cols(cls, v):
        return BaseBlock._normalize_columns(v)

    # Normalize output columns before model construction
    @field_validator("output_cols", mode="before")
    @classmethod
    def normalize_output_cols(cls, v):
        return BaseBlock._normalize_columns(v)

    @staticmethod
    def _normalize_columns(
        cols: Optional[Union[str, list[str], dict[str, Any]]],
    ) -> Union[list[str], dict[str, Any]]:
        """Normalize column inputs into a standard internal format.

        Parameters
        ----------
        cols : str, list, dict, or None
            Raw column specification provided by the user.

        Returns
        -------
        Union[List[str], Dict[str, Any]]
            Cleaned and deep-copied column specification.

        Raises
        ------
        ValueError
            If the column format is unsupported.
        """
        if cols is None:
            return []
        if isinstance(cols, str):
            return [cols]
        if isinstance(cols, list):
            return cols.copy()
        if isinstance(cols, dict):
            return dict(cols)
        raise ValueError(f"Invalid column specification: {cols} (type: {type(cols)})")

    def _validate_columns(self, dataset: Dataset) -> None:
        """Check that all required input columns are present in the dataset.

        Parameters
        ----------
        dataset : Dataset
            HuggingFace dataset to validate against.

        Raises
        ------
        MissingColumnError
            If any expected input column is missing.
        """
        if not self.input_cols:
            return
        columns_to_check = (
            list(self.input_cols.keys())
            if isinstance(self.input_cols, dict)
            else self.input_cols
        )
        missing_columns = [
            col for col in columns_to_check if col not in dataset.column_names
        ]
        if missing_columns:
            raise MissingColumnError(
                block_name=self.block_name,
                missing_columns=missing_columns,
                available_columns=dataset.column_names,
            )

    def _validate_output_columns(self, dataset: Dataset) -> None:
        """Check that the output columns will not overwrite existing ones.

        Parameters
        ----------
        dataset : Dataset
            HuggingFace dataset to validate.

        Raises
        ------
        OutputColumnCollisionError
            If output columns already exist in the dataset.
        """
        if not self.output_cols:
            return
        columns_to_check = (
            list(self.output_cols.keys())
            if isinstance(self.output_cols, dict)
            else self.output_cols
        )
        collisions = [col for col in columns_to_check if col in dataset.column_names]
        if collisions:
            raise OutputColumnCollisionError(
                block_name=self.block_name,
                collision_columns=collisions,
                existing_columns=dataset.column_names,
            )

    def _validate_dataset_not_empty(self, dataset: Dataset) -> None:
        """Raise an error if the dataset is empty.

        Parameters
        ----------
        dataset : Dataset

        Raises
        ------
        EmptyDatasetError
        """
        if len(dataset) == 0:
            raise EmptyDatasetError(block_name=self.block_name)

    def _validate_dataset(self, dataset: Dataset) -> None:
        """Perform all default dataset validations."""
        self._validate_dataset_not_empty(dataset)
        self._validate_columns(dataset)
        self._validate_output_columns(dataset)

    def _validate_custom(self, dataset: Dataset) -> None:
        """Hook for subclasses to add extra validation logic."""
        pass

    def _log_input_data(self, dataset: Dataset) -> None:
        """Print a summary of the input dataset with Rich formatting."""
        row_count = len(dataset)
        columns = dataset.column_names
        content = Text()
        content.append("\U0001f4ca Processing Input Data\n", style="bold blue")
        content.append(f"Block Type: {self.__class__.__name__}\n", style="cyan")
        content.append(f"Input Rows: {row_count:,}\n", style="bold cyan")
        content.append(f"Input Columns: {len(columns)}\n", style="cyan")
        content.append(f"Column Names: {', '.join(columns)}\n", style="white")
        expected = (
            (
                ", ".join(self.output_cols.keys())
                if isinstance(self.output_cols, dict)
                else ", ".join(self.output_cols)
            )
            if self.output_cols
            else "None specified"
        )
        content.append(f"Expected Output Columns: {expected}", style="green")
        console.print(
            Panel(content, title=f"[bold]{self.block_name}[/bold]", border_style="blue")
        )

    def _log_output_data(self, input_dataset: Dataset, output_dataset: Dataset) -> None:
        """Print a Rich panel summarizing output dataset differences."""
        in_rows, out_rows = len(input_dataset), len(output_dataset)
        in_cols, out_cols = (
            set(input_dataset.column_names),
            set(output_dataset.column_names),
        )
        added_cols, removed_cols = out_cols - in_cols, in_cols - out_cols
        content = Text()
        content.append("\u2705 Processing Complete\n", style="bold green")
        content.append(f"Rows: {in_rows:,} → {out_rows:,}\n", style="cyan")
        content.append(f"Columns: {len(in_cols)} → {len(out_cols)}\n", style="cyan")
        if added_cols:
            content.append(
                f"\U0001f7e2 Added: {', '.join(sorted(added_cols))}\n", style="green"
            )
        if removed_cols:
            content.append(
                f"\U0001f534 Removed: {', '.join(sorted(removed_cols))}\n", style="red"
            )
        content.append(
            f"\U0001f4cb Final Columns: {', '.join(sorted(out_cols))}", style="white"
        )
        console.print(
            Panel(
                content,
                title=f"[bold green]{self.block_name} - Complete[/bold green]",
                border_style="green",
            )
        )

    @abstractmethod
    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Subclass method to implement data generation logic.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.

        Returns
        -------
        Dataset
            Transformed dataset with new columns or values.
        """
        pass

    def __call__(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Run the block on a dataset with full validation and logging.

        Parameters
        ----------
        samples : Dataset
            Input dataset.

        Returns
        -------
        Dataset
            Output dataset after block processing.
        """
        self._log_input_data(samples)
        self._validate_dataset(samples)
        self._validate_custom(samples)
        output_dataset = self.generate(samples, **kwargs)
        self._log_output_data(samples, output_dataset)
        return output_dataset

    def __repr__(self) -> str:
        """Compact string representation."""
        return f"{self.__class__.__name__}(name='{self.block_name}', input_cols={self.input_cols}, output_cols={self.output_cols})"

    def get_config(self) -> dict[str, Any]:
        """Return only constructor arguments for serialization.

        Returns
        -------
        Dict[str, Any]
        """
        return self.model_dump()

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "BaseBlock":
        """Instantiate block from serialized config.

        Parameters
        ----------
        config : Dict[str, Any]

        Returns
        -------
        BaseBlock
        """
        return cls(**config)

    def get_info(self) -> dict[str, Any]:
        """Return a high-level summary of block metadata and config.

        Returns
        -------
        Dict[str, Any]
        """
        config = self.get_config()
        config["block_type"] = self.__class__.__name__
        return config
```
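To illustrate the contract that `BaseBlock.__call__` enforces (non-empty input, required input columns present, output columns absent, then `generate()`), here is a minimal hypothetical subclass; the block and column names are made up for the example and are not part of the package:

```python
# Hypothetical subclass sketch, not part of the diff.
from typing import Any

from datasets import Dataset

from sdg_hub.core.blocks import BaseBlock


class UppercaseBlock(BaseBlock):
    """Copy input_cols[0] into output_cols[0], upper-cased."""

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        src, dst = self.input_cols[0], self.output_cols[0]
        return samples.map(lambda row: {dst: row[src].upper()})


block = UppercaseBlock(block_name="upper", input_cols="text", output_cols="text_upper")
ds = Dataset.from_dict({"text": ["hello", "world"]})
result = block(ds)  # validation + rich logging happen in __call__, then generate()
print(result["text_upper"])  # ['HELLO', 'WORLD']
```

Note that the string column specs are normalized to lists by the `field_validator` hooks, which is why `self.input_cols[0]` works in the sketch.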
sdg_hub/core/blocks/deprecated_blocks/__init__.py
ADDED
@@ -0,0 +1,29 @@
```python
# SPDX-License-Identifier: Apache-2.0
"""Deprecated blocks for backwards compatibility.

This module contains deprecated block implementations that are maintained
for backwards compatibility. These blocks should not be used in new code.
"""

# Local
from .combine_columns import CombineColumnsBlock
from .duplicate_columns import DuplicateColumns
from .filter_by_value import FilterByValueBlock
from .flatten_columns import FlattenColumnsBlock
from .llmblock import LLMBlock
from .rename_columns import RenameColumns
from .sample_populator import SamplePopulatorBlock
from .selector import SelectorBlock
from .set_to_majority_value import SetToMajorityValue

__all__ = [
    "CombineColumnsBlock",
    "DuplicateColumns",
    "FilterByValueBlock",
    "FlattenColumnsBlock",
    "LLMBlock",
    "RenameColumns",
    "SamplePopulatorBlock",
    "SelectorBlock",
    "SetToMajorityValue",
]
```
sdg_hub/core/blocks/deprecated_blocks/combine_columns.py
ADDED
@@ -0,0 +1,93 @@
```python
# SPDX-License-Identifier: Apache-2.0
"""DEPRECATED: CombineColumnsBlock for backward compatibility.

This module provides a deprecated wrapper for the old CombineColumnsBlock interface.
Use transform.CombineColumnsBlock instead.
"""

# Standard
from typing import Any
import warnings

# Third Party
from datasets import Dataset

# Local
from ...utils.logger_config import setup_logger
from ..base import BaseBlock
from ..registry import BlockRegistry
from ..transform.text_concat import TextConcatBlock

logger = setup_logger(__name__)


@BlockRegistry.register(
    "CombineColumnsBlock",
    "deprecated",
    "DEPRECATED: Use TextConcatBlock instead. Combines multiple columns into a single column using a separator",
)
class CombineColumnsBlock(BaseBlock):
    r"""DEPRECATED: Combine multiple columns into a single column using a separator.

    .. deprecated::
        Use `sdg_hub.blocks.transform.CombineColumnsBlock` instead.
        This class will be removed in a future version.

    This block concatenates values from multiple columns into a single output column,
    using a specified separator between values.

    Parameters
    ----------
    block_name : str
        Name of the block.
    columns : List[str]
        List of column names to combine.
    output_col : str
        Name of the column to store combined values.
    separator : str, optional
        String to use as separator between combined values, by default "\\n\\n".
    **batch_kwargs : Dict[str, Any]
        Additional keyword arguments for batch processing.
    """

    def __init__(
        self,
        block_name: str,
        columns: list[str],
        output_col: str,
        separator: str = "\n\n",
        **batch_kwargs: dict[str, Any],
    ) -> None:
        warnings.warn(
            "CombineColumnsBlock is deprecated. Use sdg_hub.blocks.transform.TextConcatBlock instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        # Initialize with dummy values for BaseBlock validation
        super().__init__(
            block_name=block_name, input_cols=columns, output_cols=[output_col]
        )

        # Create the new implementation
        self._impl = TextConcatBlock(
            block_name=block_name,
            input_cols=columns,
            output_cols=[output_col],
            separator=separator,
        )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a dataset with combined columns.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.

        Returns
        -------
        Dataset
            Dataset with combined values stored in output column.
        """
        return self._impl.generate(samples)
```
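Since the shim simply forwards to `TextConcatBlock`, migrating is mostly a matter of renaming constructor arguments. A before/after sketch with illustrative column names (not from the diff):

```python
# Migration sketch; column names are illustrative only.
from datasets import Dataset

from sdg_hub.core.blocks import CombineColumnsBlock, TextConcatBlock

ds = Dataset.from_dict({"title": ["Doc A"], "body": ["Some text."]})

old_block = CombineColumnsBlock(  # emits DeprecationWarning
    block_name="combine", columns=["title", "body"], output_col="document"
)
new_block = TextConcatBlock(  # direct replacement
    block_name="combine",
    input_cols=["title", "body"],
    output_cols=["document"],
    separator="\n\n",
)

# The wrapper delegates to TextConcatBlock, so both produce the same column.
assert old_block.generate(ds)["document"] == new_block.generate(ds)["document"]
```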
sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py
ADDED
@@ -0,0 +1,88 @@
```python
# SPDX-License-Identifier: Apache-2.0
"""Deprecated DuplicateColumns for backwards compatibility.

This module provides a deprecated wrapper around DuplicateColumnsBlock
to maintain backwards compatibility with existing code and configurations.
"""

# Standard
from typing import Any
import warnings

# Third Party
from datasets import Dataset

# Local
from ...utils.logger_config import setup_logger
from ..base import BaseBlock
from ..registry import BlockRegistry
from ..transform import DuplicateColumnsBlock

logger = setup_logger(__name__)


@BlockRegistry.register(
    "DuplicateColumns",
    "deprecated",
    "DEPRECATED: Use DuplicateColumnsBlock instead. Duplicates existing columns with new names according to a mapping dictionary",
)
class DuplicateColumns(BaseBlock):
    """DEPRECATED: Block for duplicating existing columns with new names.

    This block is deprecated and maintained only for backwards compatibility.
    Please use DuplicateColumnsBlock instead.

    This block creates copies of existing columns with new names as specified
    in the columns mapping dictionary.
    """

    def __init__(
        self,
        block_name: str,
        columns_map: dict[str, str],
    ) -> None:
        """Initialize the deprecated DuplicateColumns.

        Parameters
        ----------
        block_name : str
            Name of the block.
        columns_map : Dict[str, str]
            Dictionary mapping existing column names to new column names.
            Keys are existing column names, values are new column names.
        """
        # Issue deprecation warning
        warnings.warn(
            "DuplicateColumns is deprecated and will be removed in a future version. "
            "Please use DuplicateColumnsBlock instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        # Map old signature to new signature
        super().__init__(
            block_name=block_name,
            input_cols=columns_map,
            output_cols=list(columns_map.values()),
        )

        # Create the new block instance with mapped parameters
        self._new_block = DuplicateColumnsBlock(
            block_name=block_name,
            input_cols=columns_map,
        )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate dataset with duplicated columns using the new DuplicateColumnsBlock.

        Parameters
        ----------
        samples : Dataset
            The input dataset to duplicate columns from.

        Returns
        -------
        Dataset
            The dataset with additional duplicated columns.
        """
        return self._new_block.generate(samples, **kwargs)
```
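As with the other shims, `DuplicateColumns` forwards to `DuplicateColumnsBlock` with `input_cols` set to the old `columns_map`, so migration is again an argument rename. A small sketch with an illustrative mapping (not from the diff):

```python
# Migration sketch; the column mapping is illustrative only.
from datasets import Dataset

from sdg_hub.core.blocks import DuplicateColumns, DuplicateColumnsBlock

ds = Dataset.from_dict({"question": ["What does SDG Hub generate?"]})

legacy = DuplicateColumns(block_name="dup", columns_map={"question": "prompt"})  # warns
current = DuplicateColumnsBlock(block_name="dup", input_cols={"question": "prompt"})

# The legacy wrapper forwards to DuplicateColumnsBlock, so outputs match.
assert legacy.generate(ds)["prompt"] == current.generate(ds)["prompt"]
```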
|