sdg-hub 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

Files changed (139)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +25 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  28. sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
  29. sdg_hub/core/blocks/registry.py +331 -0
  30. sdg_hub/core/blocks/transform/__init__.py +23 -0
  31. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  32. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  33. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  34. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  35. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  36. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  37. sdg_hub/core/flow/__init__.py +20 -0
  38. sdg_hub/core/flow/base.py +980 -0
  39. sdg_hub/core/flow/metadata.py +344 -0
  40. sdg_hub/core/flow/migration.py +187 -0
  41. sdg_hub/core/flow/registry.py +330 -0
  42. sdg_hub/core/flow/validation.py +265 -0
  43. sdg_hub/{utils → core/utils}/__init__.py +6 -4
  44. sdg_hub/{utils → core/utils}/datautils.py +1 -3
  45. sdg_hub/core/utils/error_handling.py +208 -0
  46. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  47. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  48. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  49. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  50. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  51. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  55. sdg_hub-0.2.0.dist-info/METADATA +218 -0
  56. sdg_hub-0.2.0.dist-info/RECORD +63 -0
  57. sdg_hub/blocks/__init__.py +0 -42
  58. sdg_hub/blocks/block.py +0 -96
  59. sdg_hub/blocks/llmblock.py +0 -375
  60. sdg_hub/blocks/openaichatblock.py +0 -556
  61. sdg_hub/blocks/utilblocks.py +0 -597
  62. sdg_hub/checkpointer.py +0 -139
  63. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  64. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  65. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  66. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  67. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  68. sdg_hub/configs/knowledge/__init__.py +0 -0
  69. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  70. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  71. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  72. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  73. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  74. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  75. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  76. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  77. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  78. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  79. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  80. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  81. sdg_hub/configs/knowledge/router.yaml +0 -12
  82. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  83. sdg_hub/configs/reasoning/__init__.py +0 -0
  84. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  85. sdg_hub/configs/skills/__init__.py +0 -0
  86. sdg_hub/configs/skills/analyzer.yaml +0 -48
  87. sdg_hub/configs/skills/annotation.yaml +0 -36
  88. sdg_hub/configs/skills/contexts.yaml +0 -28
  89. sdg_hub/configs/skills/critic.yaml +0 -60
  90. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  91. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  92. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  93. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  94. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  95. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  96. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  97. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  98. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  99. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  100. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  101. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  102. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  103. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  104. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  105. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  106. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  107. sdg_hub/configs/skills/judge.yaml +0 -53
  108. sdg_hub/configs/skills/planner.yaml +0 -67
  109. sdg_hub/configs/skills/respond.yaml +0 -8
  110. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  111. sdg_hub/configs/skills/router.yaml +0 -59
  112. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  113. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  114. sdg_hub/flow.py +0 -477
  115. sdg_hub/flow_runner.py +0 -450
  116. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  117. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  118. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  119. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -148
  120. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  121. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  122. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  123. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  124. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  125. sdg_hub/pipeline.py +0 -121
  126. sdg_hub/prompts.py +0 -74
  127. sdg_hub/registry.py +0 -122
  128. sdg_hub/sdg.py +0 -206
  129. sdg_hub/utils/config_validation.py +0 -91
  130. sdg_hub/utils/error_handling.py +0 -94
  131. sdg_hub/utils/validation_result.py +0 -10
  132. sdg_hub-0.1.3.dist-info/METADATA +0 -190
  133. sdg_hub-0.1.3.dist-info/RECORD +0 -89
  134. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  135. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  136. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  137. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
  138. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
  139. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
sdg_hub/__init__.py CHANGED
@@ -1,3 +1,30 @@
  # SPDX-License-Identifier: Apache-2.0
+ """SDG Hub - Synthetic Data Generation Framework."""
+
  # Local
- from .sdg import SDG
+ # Local
+ from .core import (
+     BaseBlock,
+     BlockRegistry,
+     Flow,
+     FlowMetadata,
+     FlowParameter,
+     FlowRegistry,
+     FlowValidator,
+     GenerateError,
+     resolve_path,
+ )
+
+ __all__ = [
+     # Core framework classes (top-level access)
+     "BaseBlock",
+     "BlockRegistry",
+     "Flow",
+     "FlowRegistry",
+     # Metadata and utilities
+     "FlowMetadata",
+     "FlowParameter",
+     "FlowValidator",
+     "GenerateError",
+     "resolve_path",
+ ]
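
For readers tracking the API change above: 0.2.0 drops the top-level `SDG` export and instead re-exports the new core classes. Below is a minimal, hedged import sketch against 0.2.0 — only the names in the new `__all__` are confirmed by this diff; the flow-loading and execution API lives in `core/flow/base.py`, which is added in this release but not reproduced here.

```python
# Import sketch against sdg_hub 0.2.0 (names confirmed by the new __all__ above).
# `from sdg_hub import SDG` no longer works; the core classes are re-exported instead.
from sdg_hub import (
    BaseBlock,
    BlockRegistry,
    Flow,
    FlowRegistry,
    GenerateError,
    resolve_path,
)

# The Flow/FlowRegistry runtime behavior is defined in core/flow/ (not shown in this
# diff), so nothing beyond the imports themselves is assumed here.
print(BaseBlock, BlockRegistry, Flow, FlowRegistry, GenerateError, resolve_path)
```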
sdg_hub/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.1.3'
- __version_tuple__ = version_tuple = (0, 1, 3)
+ __version__ = version = '0.2.0'
+ __version_tuple__ = version_tuple = (0, 2, 0)
sdg_hub/core/__init__.py ADDED
@@ -0,0 +1,22 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """Core SDG Hub components."""
+
+ # Local
+ from .blocks import BaseBlock, BlockRegistry
+ from .flow import Flow, FlowMetadata, FlowParameter, FlowRegistry, FlowValidator
+ from .utils import GenerateError, resolve_path
+
+ __all__ = [
+     # Block components
+     "BaseBlock",
+     "BlockRegistry",
+     # Flow components
+     "Flow",
+     "FlowRegistry",
+     "FlowMetadata",
+     "FlowParameter",
+     "FlowValidator",
+     # Utils
+     "GenerateError",
+     "resolve_path",
+ ]
sdg_hub/core/blocks/__init__.py ADDED
@@ -0,0 +1,58 @@
+ """Block implementations for SDG Hub.
+
+ This package provides various block implementations for data generation, processing, and transformation.
+ """
+
+ # Local
+ from .base import BaseBlock
+ from .deprecated_blocks import (
+     CombineColumnsBlock,
+     DuplicateColumns,
+     FilterByValueBlock,
+     FlattenColumnsBlock,
+     LLMBlock,
+     RenameColumns,
+     SamplePopulatorBlock,
+     SelectorBlock,
+     SetToMajorityValue,
+ )
+ from .evaluation import EvaluateFaithfulnessBlock, EvaluateRelevancyBlock
+ from .filtering import ColumnValueFilterBlock
+ from .llm import LLMChatBlock, PromptBuilderBlock, TextParserBlock
+ from .registry import BlockRegistry
+ from .transform import (
+     DuplicateColumnsBlock,
+     IndexBasedMapperBlock,
+     MeltColumnsBlock,
+     RenameColumnsBlock,
+     TextConcatBlock,
+     UniformColumnValueSetter,
+ )
+
+ # All blocks moved to deprecated_blocks or transform modules
+
+ __all__ = [
+     "BaseBlock",
+     "BlockRegistry",
+     "ColumnValueFilterBlock",
+     "DuplicateColumnsBlock",
+     "IndexBasedMapperBlock",
+     "MeltColumnsBlock",
+     "RenameColumnsBlock",
+     "TextConcatBlock",
+     "UniformColumnValueSetter",
+     "CombineColumnsBlock",  # Deprecated
+     "DuplicateColumns",  # Deprecated
+     "FilterByValueBlock",  # Deprecated
+     "FlattenColumnsBlock",  # Deprecated
+     "RenameColumns",  # Deprecated
+     "SamplePopulatorBlock",  # Deprecated
+     "SelectorBlock",  # Deprecated
+     "SetToMajorityValue",  # Deprecated
+     "LLMBlock",  # Deprecated
+     "LLMChatBlock",
+     "TextParserBlock",
+     "PromptBuilderBlock",
+     "EvaluateFaithfulnessBlock",
+     "EvaluateRelevancyBlock",
+ ]
sdg_hub/core/blocks/base.py ADDED
@@ -0,0 +1,313 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """Enhanced base block implementation with standardized patterns.
+
+ This module provides a comprehensive base class for all blocks in the system,
+ with unified constructor patterns, column handling, and common functionality.
+ """
+
+ # Standard
+ from abc import ABC, abstractmethod
+ from typing import Any, Optional, Union
+
+ # Third Party
+ from datasets import Dataset
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.text import Text
+
+ # Local
+ from ..utils.error_handling import (
+     EmptyDatasetError,
+     MissingColumnError,
+     OutputColumnCollisionError,
+ )
+ from ..utils.logger_config import setup_logger
+
+ logger = setup_logger(__name__)
+ console = Console()
+
+
+ class BaseBlock(BaseModel, ABC):
+     """Base class for all blocks, with standardized patterns and full Pydantic compatibility.
+
+     This class defines a unified, configurable base for building composable data processing blocks
+     that operate over HuggingFace Datasets. It supports field-based initialization, validation,
+     and rich logging for inputs and outputs.
+
+     Attributes
+     ----------
+     block_name : str
+         Unique identifier for this block instance.
+     input_cols : Union[List[str], Dict[str, Any]]
+         Input columns from the dataset (string, list of strings, or mapping).
+     output_cols : Union[List[str], Dict[str, Any]]
+         Output columns to write to the dataset (string, list of strings, or mapping).
+     """
+
+     block_name: str = Field(
+         ..., description="Unique identifier for this block instance"
+     )
+     input_cols: Union[str, list[str], dict[str, Any], None] = Field(
+         None, description="Input columns: str, list, or dict"
+     )
+     output_cols: Union[str, list[str], dict[str, Any], None] = Field(
+         None, description="Output columns: str, list, or dict"
+     )
+
+     # Allow extra config fields and complex types like Dataset
+     model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
+
+     # Normalize input columns before model construction
+     @field_validator("input_cols", mode="before")
+     @classmethod
+     def normalize_input_cols(cls, v):
+         return BaseBlock._normalize_columns(v)
+
+     # Normalize output columns before model construction
+     @field_validator("output_cols", mode="before")
+     @classmethod
+     def normalize_output_cols(cls, v):
+         return BaseBlock._normalize_columns(v)
+
+     @staticmethod
+     def _normalize_columns(
+         cols: Optional[Union[str, list[str], dict[str, Any]]],
+     ) -> Union[list[str], dict[str, Any]]:
+         """Normalize column inputs into a standard internal format.
+
+         Parameters
+         ----------
+         cols : str, list, dict, or None
+             Raw column specification provided by the user.
+
+         Returns
+         -------
+         Union[List[str], Dict[str, Any]]
+             Cleaned and deep-copied column specification.
+
+         Raises
+         ------
+         ValueError
+             If the column format is unsupported.
+         """
+         if cols is None:
+             return []
+         if isinstance(cols, str):
+             return [cols]
+         if isinstance(cols, list):
+             return cols.copy()
+         if isinstance(cols, dict):
+             return dict(cols)
+         raise ValueError(f"Invalid column specification: {cols} (type: {type(cols)})")
+
+     def _validate_columns(self, dataset: Dataset) -> None:
+         """Check that all required input columns are present in the dataset.
+
+         Parameters
+         ----------
+         dataset : Dataset
+             HuggingFace dataset to validate against.
+
+         Raises
+         ------
+         MissingColumnError
+             If any expected input column is missing.
+         """
+         if not self.input_cols:
+             return
+         columns_to_check = (
+             list(self.input_cols.keys())
+             if isinstance(self.input_cols, dict)
+             else self.input_cols
+         )
+         missing_columns = [
+             col for col in columns_to_check if col not in dataset.column_names
+         ]
+         if missing_columns:
+             raise MissingColumnError(
+                 block_name=self.block_name,
+                 missing_columns=missing_columns,
+                 available_columns=dataset.column_names,
+             )
+
+     def _validate_output_columns(self, dataset: Dataset) -> None:
+         """Check that the output columns will not overwrite existing ones.
+
+         Parameters
+         ----------
+         dataset : Dataset
+             HuggingFace dataset to validate.
+
+         Raises
+         ------
+         OutputColumnCollisionError
+             If output columns already exist in the dataset.
+         """
+         if not self.output_cols:
+             return
+         columns_to_check = (
+             list(self.output_cols.keys())
+             if isinstance(self.output_cols, dict)
+             else self.output_cols
+         )
+         collisions = [col for col in columns_to_check if col in dataset.column_names]
+         if collisions:
+             raise OutputColumnCollisionError(
+                 block_name=self.block_name,
+                 collision_columns=collisions,
+                 existing_columns=dataset.column_names,
+             )
+
+     def _validate_dataset_not_empty(self, dataset: Dataset) -> None:
+         """Raise an error if the dataset is empty.
+
+         Parameters
+         ----------
+         dataset : Dataset
+
+         Raises
+         ------
+         EmptyDatasetError
+         """
+         if len(dataset) == 0:
+             raise EmptyDatasetError(block_name=self.block_name)
+
+     def _validate_dataset(self, dataset: Dataset) -> None:
+         """Perform all default dataset validations."""
+         self._validate_dataset_not_empty(dataset)
+         self._validate_columns(dataset)
+         self._validate_output_columns(dataset)
+
+     def _validate_custom(self, dataset: Dataset) -> None:
+         """Hook for subclasses to add extra validation logic."""
+         pass
+
+     def _log_input_data(self, dataset: Dataset) -> None:
+         """Print a summary of the input dataset with Rich formatting."""
+         row_count = len(dataset)
+         columns = dataset.column_names
+         content = Text()
+         content.append("\U0001f4ca Processing Input Data\n", style="bold blue")
+         content.append(f"Block Type: {self.__class__.__name__}\n", style="cyan")
+         content.append(f"Input Rows: {row_count:,}\n", style="bold cyan")
+         content.append(f"Input Columns: {len(columns)}\n", style="cyan")
+         content.append(f"Column Names: {', '.join(columns)}\n", style="white")
+         expected = (
+             (
+                 ", ".join(self.output_cols.keys())
+                 if isinstance(self.output_cols, dict)
+                 else ", ".join(self.output_cols)
+             )
+             if self.output_cols
+             else "None specified"
+         )
+         content.append(f"Expected Output Columns: {expected}", style="green")
+         console.print(
+             Panel(content, title=f"[bold]{self.block_name}[/bold]", border_style="blue")
+         )
+
+     def _log_output_data(self, input_dataset: Dataset, output_dataset: Dataset) -> None:
+         """Print a Rich panel summarizing output dataset differences."""
+         in_rows, out_rows = len(input_dataset), len(output_dataset)
+         in_cols, out_cols = (
+             set(input_dataset.column_names),
+             set(output_dataset.column_names),
+         )
+         added_cols, removed_cols = out_cols - in_cols, in_cols - out_cols
+         content = Text()
+         content.append("\u2705 Processing Complete\n", style="bold green")
+         content.append(f"Rows: {in_rows:,} → {out_rows:,}\n", style="cyan")
+         content.append(f"Columns: {len(in_cols)} → {len(out_cols)}\n", style="cyan")
+         if added_cols:
+             content.append(
+                 f"\U0001f7e2 Added: {', '.join(sorted(added_cols))}\n", style="green"
+             )
+         if removed_cols:
+             content.append(
+                 f"\U0001f534 Removed: {', '.join(sorted(removed_cols))}\n", style="red"
+             )
+         content.append(
+             f"\U0001f4cb Final Columns: {', '.join(sorted(out_cols))}", style="white"
+         )
+         console.print(
+             Panel(
+                 content,
+                 title=f"[bold green]{self.block_name} - Complete[/bold green]",
+                 border_style="green",
+             )
+         )
+
+     @abstractmethod
+     def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+         """Subclass method to implement data generation logic.
+
+         Parameters
+         ----------
+         samples : Dataset
+             Input dataset to process.
+
+         Returns
+         -------
+         Dataset
+             Transformed dataset with new columns or values.
+         """
+         pass
+
+     def __call__(self, samples: Dataset, **kwargs: Any) -> Dataset:
+         """Run the block on a dataset with full validation and logging.
+
+         Parameters
+         ----------
+         samples : Dataset
+             Input dataset.
+
+         Returns
+         -------
+         Dataset
+             Output dataset after block processing.
+         """
+         self._log_input_data(samples)
+         self._validate_dataset(samples)
+         self._validate_custom(samples)
+         output_dataset = self.generate(samples, **kwargs)
+         self._log_output_data(samples, output_dataset)
+         return output_dataset
+
+     def __repr__(self) -> str:
+         """Compact string representation."""
+         return f"{self.__class__.__name__}(name='{self.block_name}', input_cols={self.input_cols}, output_cols={self.output_cols})"
+
+     def get_config(self) -> dict[str, Any]:
+         """Return only constructor arguments for serialization.
+
+         Returns
+         -------
+         Dict[str, Any]
+         """
+         return self.model_dump()
+
+     @classmethod
+     def from_config(cls, config: dict[str, Any]) -> "BaseBlock":
+         """Instantiate block from serialized config.
+
+         Parameters
+         ----------
+         config : Dict[str, Any]
+
+         Returns
+         -------
+         BaseBlock
+         """
+         return cls(**config)
+
+     def get_info(self) -> dict[str, Any]:
+         """Return a high-level summary of block metadata and config.
+
+         Returns
+         -------
+         Dict[str, Any]
+         """
+         config = self.get_config()
+         config["block_type"] = self.__class__.__name__
+         return config
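
Since base.py defines the contract every 0.2.0 block follows (Pydantic fields, column normalization, dataset validation, Rich logging, and an abstract `generate()`), a short hedged sketch of a custom block may help. `UppercaseBlock`, its column names, and the sample data are illustrative only and not part of the package; the sketch relies solely on the BaseBlock API shown in the hunk above.

```python
# Minimal sketch of a custom block built on the new BaseBlock (sdg_hub 0.2.0).
from typing import Any

from datasets import Dataset

from sdg_hub import BaseBlock


class UppercaseBlock(BaseBlock):
    """Copy each input column into the matching output column, upper-cased."""

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        # input_cols/output_cols arrive as lists thanks to BaseBlock's field validators.
        for in_col, out_col in zip(self.input_cols, self.output_cols):
            samples = samples.map(
                lambda row, i=in_col, o=out_col: {o: row[i].upper()}
            )
        return samples


block = UppercaseBlock(
    block_name="uppercase_demo",
    input_cols=["document"],
    output_cols=["document_upper"],
)
ds = Dataset.from_dict({"document": ["hello sdg hub"]})
# Calling the block runs the empty/missing-column/collision checks, then generate(),
# with Rich panels logged before and after.
result = block(ds)
print(result.column_names)
```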
sdg_hub/core/blocks/deprecated_blocks/__init__.py ADDED
@@ -0,0 +1,29 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """Deprecated blocks for backwards compatibility.
+
+ This module contains deprecated block implementations that are maintained
+ for backwards compatibility. These blocks should not be used in new code.
+ """
+
+ # Local
+ from .combine_columns import CombineColumnsBlock
+ from .duplicate_columns import DuplicateColumns
+ from .filter_by_value import FilterByValueBlock
+ from .flatten_columns import FlattenColumnsBlock
+ from .llmblock import LLMBlock
+ from .rename_columns import RenameColumns
+ from .sample_populator import SamplePopulatorBlock
+ from .selector import SelectorBlock
+ from .set_to_majority_value import SetToMajorityValue
+
+ __all__ = [
+     "CombineColumnsBlock",
+     "DuplicateColumns",
+     "FilterByValueBlock",
+     "FlattenColumnsBlock",
+     "LLMBlock",
+     "RenameColumns",
+     "SamplePopulatorBlock",
+     "SelectorBlock",
+     "SetToMajorityValue",
+ ]
sdg_hub/core/blocks/deprecated_blocks/combine_columns.py ADDED
@@ -0,0 +1,93 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """DEPRECATED: CombineColumnsBlock for backward compatibility.
+
+ This module provides a deprecated wrapper for the old CombineColumnsBlock interface.
+ Use transform.CombineColumnsBlock instead.
+ """
+
+ # Standard
+ from typing import Any
+ import warnings
+
+ # Third Party
+ from datasets import Dataset
+
+ # Local
+ from ...utils.logger_config import setup_logger
+ from ..base import BaseBlock
+ from ..registry import BlockRegistry
+ from ..transform.text_concat import TextConcatBlock
+
+ logger = setup_logger(__name__)
+
+
+ @BlockRegistry.register(
+     "CombineColumnsBlock",
+     "deprecated",
+     "DEPRECATED: Use TextConcatBlock instead. Combines multiple columns into a single column using a separator",
+ )
+ class CombineColumnsBlock(BaseBlock):
+     r"""DEPRECATED: Combine multiple columns into a single column using a separator.
+
+     .. deprecated::
+         Use `sdg_hub.blocks.transform.CombineColumnsBlock` instead.
+         This class will be removed in a future version.
+
+     This block concatenates values from multiple columns into a single output column,
+     using a specified separator between values.
+
+     Parameters
+     ----------
+     block_name : str
+         Name of the block.
+     columns : List[str]
+         List of column names to combine.
+     output_col : str
+         Name of the column to store combined values.
+     separator : str, optional
+         String to use as separator between combined values, by default "\\n\\n".
+     **batch_kwargs : Dict[str, Any]
+         Additional keyword arguments for batch processing.
+     """
+
+     def __init__(
+         self,
+         block_name: str,
+         columns: list[str],
+         output_col: str,
+         separator: str = "\n\n",
+         **batch_kwargs: dict[str, Any],
+     ) -> None:
+         warnings.warn(
+             "CombineColumnsBlock is deprecated. Use sdg_hub.blocks.transform.TextConcatBlock instead.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+
+         # Initialize with dummy values for BaseBlock validation
+         super().__init__(
+             block_name=block_name, input_cols=columns, output_cols=[output_col]
+         )
+
+         # Create the new implementation
+         self._impl = TextConcatBlock(
+             block_name=block_name,
+             input_cols=columns,
+             output_cols=[output_col],
+             separator=separator,
+         )
+
+     def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+         """Generate a dataset with combined columns.
+
+         Parameters
+         ----------
+         samples : Dataset
+             Input dataset to process.
+
+         Returns
+         -------
+         Dataset
+             Dataset with combined values stored in output column.
+         """
+         return self._impl.generate(samples)
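
The wrapper above simply forwards to `TextConcatBlock`, so migrating existing code is mostly a constructor-argument rename. A hedged sketch follows; the constructor arguments are taken from this diff, while the assumption that `TextConcatBlock.generate()` joins `input_cols` with `separator` is inferred from the wrapper rather than from `transform/text_concat.py` itself.

```python
# Hedged migration sketch: deprecated CombineColumnsBlock vs. the TextConcatBlock it wraps.
import warnings

from datasets import Dataset

from sdg_hub.core.blocks.deprecated_blocks import CombineColumnsBlock
from sdg_hub.core.blocks.transform import TextConcatBlock

ds = Dataset.from_dict(
    {"question": ["What is SDG?"], "context": ["SDG = synthetic data generation."]}
)

# Old interface: emits a DeprecationWarning and delegates to TextConcatBlock internally.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", DeprecationWarning)
    old = CombineColumnsBlock(
        block_name="combine", columns=["question", "context"], output_col="combined"
    )

# New interface: same arguments expressed as input_cols/output_cols.
new = TextConcatBlock(
    block_name="combine",
    input_cols=["question", "context"],
    output_cols=["combined"],
    separator="\n\n",
)

print(old.generate(ds).column_names)
print(new(ds).column_names)
```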
sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py ADDED
@@ -0,0 +1,88 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """Deprecated DuplicateColumns for backwards compatibility.
+
+ This module provides a deprecated wrapper around DuplicateColumnsBlock
+ to maintain backwards compatibility with existing code and configurations.
+ """
+
+ # Standard
+ from typing import Any
+ import warnings
+
+ # Third Party
+ from datasets import Dataset
+
+ # Local
+ from ...utils.logger_config import setup_logger
+ from ..base import BaseBlock
+ from ..registry import BlockRegistry
+ from ..transform import DuplicateColumnsBlock
+
+ logger = setup_logger(__name__)
+
+
+ @BlockRegistry.register(
+     "DuplicateColumns",
+     "deprecated",
+     "DEPRECATED: Use DuplicateColumnsBlock instead. Duplicates existing columns with new names according to a mapping dictionary",
+ )
+ class DuplicateColumns(BaseBlock):
+     """DEPRECATED: Block for duplicating existing columns with new names.
+
+     This block is deprecated and maintained only for backwards compatibility.
+     Please use DuplicateColumnsBlock instead.
+
+     This block creates copies of existing columns with new names as specified
+     in the columns mapping dictionary.
+     """
+
+     def __init__(
+         self,
+         block_name: str,
+         columns_map: dict[str, str],
+     ) -> None:
+         """Initialize the deprecated DuplicateColumns.
+
+         Parameters
+         ----------
+         block_name : str
+             Name of the block.
+         columns_map : Dict[str, str]
+             Dictionary mapping existing column names to new column names.
+             Keys are existing column names, values are new column names.
+         """
+         # Issue deprecation warning
+         warnings.warn(
+             "DuplicateColumns is deprecated and will be removed in a future version. "
+             "Please use DuplicateColumnsBlock instead.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+
+         # Map old signature to new signature
+         super().__init__(
+             block_name=block_name,
+             input_cols=columns_map,
+             output_cols=list(columns_map.values()),
+         )
+
+         # Create the new block instance with mapped parameters
+         self._new_block = DuplicateColumnsBlock(
+             block_name=block_name,
+             input_cols=columns_map,
+         )
+
+     def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+         """Generate dataset with duplicated columns using the new DuplicateColumnsBlock.
+
+         Parameters
+         ----------
+         samples : Dataset
+             The input dataset to duplicate columns from.
+
+         Returns
+         -------
+         Dataset
+             The dataset with additional duplicated columns.
+         """
+         return self._new_block.generate(samples, **kwargs)
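
As with the previous wrapper, migration here amounts to passing the old `columns_map` argument as `input_cols` to `DuplicateColumnsBlock`. A hedged sketch follows; the expected extra column is inferred from the docstring above, since `transform/duplicate_columns.py` itself is not reproduced in this diff.

```python
# Hedged migration sketch for the column-duplication wrapper shown above.
from datasets import Dataset

from sdg_hub.core.blocks.transform import DuplicateColumnsBlock

ds = Dataset.from_dict({"document": ["original text"]})

block = DuplicateColumnsBlock(
    block_name="dup_demo",
    input_cols={"document": "document_copy"},  # was the columns_map argument in 0.1.x
)

# Expected (per the deprecated wrapper's docstring): a copy of "document" named "document_copy".
print(block(ds).column_names)
```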