sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -1,597 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Utility blocks for dataset manipulation and transformation.
3
-
4
- This module provides various utility blocks for operations like column manipulation,
5
- data population, selection, and transformation of datasets.
6
- """
7
-
8
- # Standard
9
- import operator
10
- from typing import Any, Callable, Dict, List, Optional, Type, Union
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from .block import Block
17
- from ..registry import BlockRegistry
18
- from ..logger_config import setup_logger
19
-
20
- logger = setup_logger(__name__)
21
-
22
-
23
@BlockRegistry.register("FilterByValueBlock")
class FilterByValueBlock(Block):
    """A block for filtering datasets based on column values.

    A row is kept when the value in ``filter_column`` satisfies ``operation``
    against at least one of the configured filter values. An optional dtype
    conversion is applied to the column first; values that cannot be converted
    are replaced with ``None`` and dropped by the filter.
    """

    def __init__(
        self,
        block_name: str,
        filter_column: str,
        filter_value: Union[Any, List[Any]],
        operation: Callable[[Any, Any], bool],
        convert_dtype: Optional[Union[Type[float], Type[int]]] = None,
        **batch_kwargs: Dict[str, Any],
    ) -> None:
        """Initialize a new FilterByValueBlock instance.

        Parameters
        ----------
        block_name : str
            Name of the block.
        filter_column : str
            The name of the column in the dataset to apply the filter on.
        filter_value : Union[Any, List[Any]]
            The value(s) to filter by. A single value is wrapped in a list.
        operation : Callable[[Any, Any], bool]
            A binary operator from the operator module (e.g., operator.eq,
            operator.contains) that takes two arguments and returns a boolean.
        convert_dtype : Optional[Union[Type[float], Type[int]]], optional
            Type to convert the filter column to. Can be either float or int.
            If None, no conversion is performed.
        **batch_kwargs : Dict[str, Any]
            Additional kwargs for batch processing. Only ``num_procs``
            (default 1) is read.

        Raises
        ------
        ValueError
            If the operation is not from the operator module.
        """
        super().__init__(block_name=block_name)
        # Validate that operation is from operator module. getattr keeps the
        # documented ValueError for callables that carry no __module__
        # (e.g. functools.partial objects) instead of leaking AttributeError.
        if getattr(operation, "__module__", None) != "_operator":
            logger.error("Invalid operation: %s", operation)
            raise ValueError("Operation must be from operator module")

        self.value = filter_value if isinstance(filter_value, list) else [filter_value]
        self.column_name = filter_column
        self.operation = operation
        self.convert_dtype = convert_dtype
        self.num_procs = batch_kwargs.get("num_procs", 1)

    def _convert_dtype(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the data type of the filter column.

        Parameters
        ----------
        sample : Dict[str, Any]
            The sample dictionary containing the column to convert.

        Returns
        -------
        Dict[str, Any]
            The sample with converted column value, or ``None`` in that column
            when the value cannot be converted.
        """
        try:
            sample[self.column_name] = self.convert_dtype(sample[self.column_name])
        except (ValueError, TypeError) as e:
            # TypeError covers inputs such as None, which int()/float() reject
            # with TypeError rather than ValueError.
            logger.error(
                "Error converting dtype: %s, filling with None to be filtered later", e
            )
            sample[self.column_name] = None
        return sample

    def generate(self, samples: Dataset) -> Dataset:
        """Generate filtered dataset based on specified conditions.

        Parameters
        ----------
        samples : Dataset
            The input dataset to filter.

        Returns
        -------
        Dataset
            The filtered dataset. Rows whose filter column is ``None`` are
            always removed.
        """
        if self.convert_dtype:
            samples = samples.map(
                self._convert_dtype,
                num_proc=self.num_procs,
            )

        # Drop missing values up front so neither branch below has to compare
        # against None.
        samples = samples.filter(
            lambda x: x[self.column_name] is not None,
            num_proc=self.num_procs,
        )

        if self.operation == operator.contains:
            # For ``contains`` the container is the list of filter values and
            # the column value is the candidate member. Return here: running
            # the generic filter afterwards would call contains(cell, value)
            # with swapped operands, which raises TypeError for non-container
            # cells such as ints.
            return samples.filter(
                lambda x: self.operation(self.value, x[self.column_name]),
                num_proc=self.num_procs,
            )

        return samples.filter(
            lambda x: any(
                self.operation(x[self.column_name], value) for value in self.value
            ),
            num_proc=self.num_procs,
        )
140
-
141
-
142
@BlockRegistry.register("SamplePopulatorBlock")
class SamplePopulatorBlock(Block):
    """Block that merges per-sample configuration data into a dataset.

    Each configuration file is loaded once at construction time and stored
    under a key derived from its file name (the final path component up to
    its first dot). During generation, every sample is merged with the
    configuration whose key matches the sample's ``column_name`` value.

    Parameters
    ----------
    block_name : str
        Name of the block.
    config_paths : List[str]
        List of paths to configuration files to load.
    column_name : str
        Name of the column whose value selects the configuration to merge.
    post_fix : str, optional
        Suffix inserted before ``.yaml`` in each path before loading,
        by default "".
    **batch_kwargs : Dict[str, Any]
        Additional keyword arguments for batch processing. Only ``num_procs``
        (default 8) is read.
    """

    def __init__(
        self,
        block_name: str,
        config_paths: List[str],
        column_name: str,
        post_fix: str = "",
        **batch_kwargs: Dict[str, Any],
    ) -> None:
        super().__init__(block_name=block_name)
        self.configs = {}
        for path in config_paths:
            # The lookup key always comes from the un-suffixed path.
            key = path.rsplit("/", 1)[-1].split(".", 1)[0]
            target = path.replace(".yaml", f"_{post_fix}.yaml") if post_fix else path
            self.configs[key] = self._load_config(target)
        self.column_name = column_name
        self.num_procs = batch_kwargs.get("num_procs", 8)

    def _generate(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Merge the matching configuration into a single sample.

        Parameters
        ----------
        sample : Dict[str, Any]
            Input sample; its ``column_name`` value selects the configuration.

        Returns
        -------
        Dict[str, Any]
            New mapping with the sample's fields overlaid by the configuration
            fields (configuration wins on key collisions).
        """
        selected = self.configs[sample[self.column_name]]
        return {**sample, **selected}

    def generate(self, samples: Dataset) -> Dataset:
        """Populate every sample of the dataset with configuration data.

        Parameters
        ----------
        samples : Dataset
            Input dataset to populate with configuration data.

        Returns
        -------
        Dataset
            Dataset populated with configuration data.
        """
        return samples.map(self._generate, num_proc=self.num_procs)
215
-
216
-
217
@BlockRegistry.register("SelectorBlock")
class SelectorBlock(Block):
    """Block that copies one of several columns into an output column.

    For each sample, the value found in ``choice_col`` is looked up in
    ``choice_map`` to obtain a source column name; that column's value is then
    written to ``output_col``.

    Parameters
    ----------
    block_name : str
        Name of the block.
    choice_map : Dict[str, str]
        Dictionary mapping choice values to column names.
    choice_col : str
        Name of the column containing choice values.
    output_col : str
        Name of the column to store selected values.
    **batch_kwargs : Dict[str, Any]
        Additional keyword arguments for batch processing. Only ``num_procs``
        (default 8) is read.
    """

    def __init__(
        self,
        block_name: str,
        choice_map: Dict[str, str],
        choice_col: str,
        output_col: str,
        **batch_kwargs: Dict[str, Any],
    ) -> None:
        super().__init__(block_name=block_name)
        self.choice_map = choice_map
        self.choice_col = choice_col
        self.output_col = output_col
        self.num_procs = batch_kwargs.get("num_procs", 8)

    def _generate(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Resolve the choice for one sample and store the selected value.

        Parameters
        ----------
        sample : Dict[str, Any]
            Input sample to process.

        Returns
        -------
        Dict[str, Any]
            The same sample with the selected value stored in the output column.
        """
        choice = sample[self.choice_col]
        source_col = self.choice_map[choice]
        sample[self.output_col] = sample[source_col]
        return sample

    def generate(self, samples: Dataset) -> Dataset:
        """Apply the selection to every sample of the dataset.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.

        Returns
        -------
        Dataset
            Dataset with selected values stored in output column.
        """
        return samples.map(self._generate, num_proc=self.num_procs)
283
-
284
-
285
@BlockRegistry.register("CombineColumnsBlock")
class CombineColumnsBlock(Block):
    r"""Block that joins several columns into one string column.

    The values of the listed columns are converted with ``str`` and joined,
    in the given order, with ``separator`` between them.

    Parameters
    ----------
    block_name : str
        Name of the block.
    columns : List[str]
        List of column names to combine.
    output_col : str
        Name of the column to store combined values.
    separator : str, optional
        String to use as separator between combined values, by default "\n\n".
    **batch_kwargs : Dict[str, Any]
        Additional keyword arguments for batch processing. Only ``num_procs``
        (default 8) is read.
    """

    def __init__(
        self,
        block_name: str,
        columns: List[str],
        output_col: str,
        separator: str = "\n\n",
        **batch_kwargs: Dict[str, Any],
    ) -> None:
        super().__init__(block_name=block_name)
        self.columns = columns
        self.output_col = output_col
        self.separator = separator
        self.num_procs = batch_kwargs.get("num_procs", 8)

    def _generate(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Write the joined column values into the output column of a sample.

        Parameters
        ----------
        sample : Dict[str, Any]
            Input sample to process.

        Returns
        -------
        Dict[str, Any]
            The same sample with combined values stored in output column.
        """
        pieces = (str(sample[name]) for name in self.columns)
        sample[self.output_col] = self.separator.join(pieces)
        return sample

    def generate(self, samples: Dataset) -> Dataset:
        """Combine the configured columns for every sample of the dataset.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.

        Returns
        -------
        Dataset
            Dataset with combined values stored in output column.
        """
        return samples.map(self._generate, num_proc=self.num_procs)
353
-
354
-
355
@BlockRegistry.register("FlattenColumnsBlock")
class FlattenColumnsBlock(Block):
    """Block that reshapes a wide dataset into long format.

    The columns listed in ``var_cols`` are melted into rows via pandas
    ``DataFrame.melt``; every other column is kept as an identifier column.

    Parameters
    ----------
    block_name : str
        Name of the block.
    var_cols : List[str]
        List of column names to be melted into rows.
    value_name : str
        Name of the new column that will contain the values.
    var_name : str
        Name of the new column that will contain the variable names.
    """

    def __init__(
        self,
        block_name: str,
        var_cols: List[str],
        value_name: str,
        var_name: str,
    ) -> None:
        super().__init__(block_name=block_name)
        self.var_cols = var_cols
        self.value_name = value_name
        self.var_name = var_name

    def generate(self, samples: Dataset) -> Dataset:
        """Melt the configured columns and return the long-format dataset.

        Parameters
        ----------
        samples : Dataset
            Input dataset to flatten.

        Returns
        -------
        Dataset
            Flattened dataset in long format with new variable and value columns.
        """
        frame = samples.to_pandas()
        # Everything that is not being melted identifies the row.
        keep_cols = [name for name in samples.column_names if name not in self.var_cols]
        melted = frame.melt(
            id_vars=keep_cols,
            value_vars=self.var_cols,
            value_name=self.value_name,
            var_name=self.var_name,
        )
        return Dataset.from_pandas(melted)
408
-
409
-
410
@BlockRegistry.register("DuplicateColumns")
class DuplicateColumns(Block):
    """Block that copies existing columns under new names.

    For every ``source -> target`` entry in ``columns_map`` a new column
    ``target`` is added holding the values of ``source``.

    Parameters
    ----------
    block_name : str
        Name of the block.
    columns_map : Dict[str, str]
        Dictionary mapping existing column names to new column names.
        Keys are existing column names, values are new column names.
    """

    def __init__(
        self,
        block_name: str,
        columns_map: Dict[str, str],
    ) -> None:
        super().__init__(block_name=block_name)
        self.columns_map = columns_map

    def generate(self, samples: Dataset) -> Dataset:
        """Add the duplicated columns to the dataset.

        Parameters
        ----------
        samples : Dataset
            Input dataset to duplicate columns from.

        Returns
        -------
        Dataset
            Dataset with additional duplicated columns.
        """
        for source, target in self.columns_map.items():
            samples = samples.add_column(target, samples[source])
        return samples
452
-
453
-
454
@BlockRegistry.register("RenameColumns")
class RenameColumns(Block):
    """Block that renames dataset columns according to a mapping.

    The mapping is passed unchanged to ``Dataset.rename_columns``.

    Parameters
    ----------
    block_name : str
        Name of the block.
    columns_map : Dict[str, str]
        Dictionary mapping existing column names to new column names.
        Keys are existing column names, values are new column names.
    """

    def __init__(
        self,
        block_name: str,
        columns_map: Dict[str, str],
    ) -> None:
        super().__init__(block_name=block_name)
        self.columns_map = columns_map

    def generate(self, samples: Dataset) -> Dataset:
        """Return the dataset with its columns renamed.

        Parameters
        ----------
        samples : Dataset
            Input dataset to rename columns in.

        Returns
        -------
        Dataset
            Dataset with renamed columns.
        """
        return samples.rename_columns(self.columns_map)
493
-
494
-
495
@BlockRegistry.register("SetToMajorityValue")
class SetToMajorityValue(Block):
    """Block for setting all values in a column to the most frequent value.

    This block finds the most common value (mode) in a specified column and
    replaces all values in that column with this majority value. When several
    values tie, the first entry returned by pandas ``Series.mode`` is used.

    Parameters
    ----------
    block_name : str
        Name of the block.
    col_name : str
        Name of the column to set to majority value.
    """

    def __init__(
        self,
        block_name: str,
        col_name: str,
    ) -> None:
        super().__init__(block_name=block_name)
        self.col_name = col_name

    def generate(self, samples: Dataset) -> Dataset:
        """Generate a dataset with column set to majority value.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.

        Returns
        -------
        Dataset
            Dataset with specified column set to its majority value. If the
            column has no mode (empty dataset or only null values) the input
            dataset is returned unchanged.
        """
        df = samples.to_pandas()
        modes = df[self.col_name].mode()
        if modes.empty:
            # mode() drops nulls, so an empty or all-null column yields an
            # empty Series; indexing it would raise instead of being a no-op.
            logger.warning(
                "Column %s has no majority value; returning dataset unchanged",
                self.col_name,
            )
            return samples
        df[self.col_name] = modes.iloc[0]
        return Dataset.from_pandas(df)
534
-
535
-
536
@BlockRegistry.register("IterBlock")
class IterBlock(Block):
    """Block that runs another block several times over the same input.

    An instance of ``block_type`` is built once from ``block_kwargs``; its
    ``generate`` method is then called ``num_iters`` times on the input
    dataset and the rows of all runs are collected into one dataset.

    Parameters
    ----------
    block_name : str
        Name of the block.
    num_iters : int
        Number of times to apply the block.
    block_type : Type[Block]
        The block class to instantiate and apply.
    block_kwargs : Dict[str, Any]
        Keyword arguments to pass to the block constructor.
    **kwargs : Dict[str, Any]
        Additional keyword arguments. Supports:
        - gen_kwargs: Dict[str, Any]
            Arguments to pass to the block's generate method.
    """

    def __init__(
        self,
        block_name: str,
        num_iters: int,
        block_type: Type[Block],
        block_kwargs: Dict[str, Any],
        **kwargs: Dict[str, Any],
    ) -> None:
        super().__init__(block_name)
        self.num_iters = num_iters
        self.block = block_type(**block_kwargs)
        self.gen_kwargs = kwargs.get("gen_kwargs", {})

    def generate(self, samples: Dataset, **gen_kwargs: Dict[str, Any]) -> Dataset:
        """Run the wrapped block ``num_iters`` times and gather all rows.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.
        **gen_kwargs : Dict[str, Any]
            Additional keyword arguments to pass to the block's generate method.
            These are merged with the gen_kwargs provided at initialization;
            call-time values win on key collisions.

        Returns
        -------
        Dataset
            Dataset containing all generated samples from all iterations.
        """
        collected_rows = []
        for _iteration in range(self.num_iters):
            output = self.block.generate(samples, **{**self.gen_kwargs, **gen_kwargs})
            # Extending with the block's output appends its rows one by one.
            collected_rows.extend(output)
        return Dataset.from_list(collected_rows)