sdg-hub 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/__init__.py +0 -22
  3. sdg_hub/core/blocks/transform/rename_columns.py +19 -0
  4. sdg_hub/core/flow/base.py +8 -80
  5. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
  6. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
  7. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
  8. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
  9. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +5 -1
  10. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/METADATA +2 -2
  11. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/RECORD +14 -25
  12. sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
  13. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
  14. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
  15. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
  16. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
  17. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
  18. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
  19. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
  20. sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
  21. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
  22. sdg_hub/core/flow/migration.py +0 -198
  23. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/WHEEL +0 -0
  24. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/licenses/LICENSE +0 -0
  25. {sdg_hub-0.4.2.dist-info → sdg_hub-0.5.0.dist-info}/top_level.txt +0 -0
@@ -1,88 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Deprecated RenameColumns for backwards compatibility.
3
-
4
- This module provides a deprecated wrapper around RenameColumnsBlock
5
- to maintain backwards compatibility with existing code and configurations.
6
- """
7
-
8
- # Standard
9
- from typing import Any
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..registry import BlockRegistry
19
- from ..transform import RenameColumnsBlock
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
@BlockRegistry.register(
    "RenameColumns",
    "deprecated",
    "DEPRECATED: Use RenameColumnsBlock instead. Renames columns in a dataset according to a mapping dictionary",
)
class RenameColumns(BaseBlock):
    """DEPRECATED: legacy entry point for renaming dataset columns.

    Kept only so that existing code and configurations that refer to
    ``RenameColumns`` keep working. Constructing it emits a
    ``DeprecationWarning``; the actual renaming is delegated to an internal
    ``RenameColumnsBlock`` built from the same mapping. Use
    ``RenameColumnsBlock`` directly in new code.
    """

    def __init__(
        self,
        block_name: str,
        columns_map: dict[str, str],
    ) -> None:
        """Warn about the deprecation and set up the delegate block.

        Parameters
        ----------
        block_name : str
            Name of the block.
        columns_map : dict[str, str]
            Mapping of existing column names (keys) to new column names
            (values).
        """
        deprecation_notice = (
            "RenameColumns is deprecated and will be removed in a future version. "
            "Please use RenameColumnsBlock instead."
        )
        # stacklevel=2 reports the warning at the code that constructs the
        # block rather than at this line.
        warnings.warn(deprecation_notice, category=DeprecationWarning, stacklevel=2)

        # The legacy ``columns_map`` argument is passed through as the new
        # ``input_cols`` argument; this wrapper declares no output columns.
        super().__init__(
            block_name=block_name,
            input_cols=columns_map,
            output_cols=[],
        )

        # Replacement block that performs the real work in ``generate``.
        self._new_block = RenameColumnsBlock(
            block_name=block_name,
            input_cols=columns_map,
        )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Rename columns by delegating to the wrapped RenameColumnsBlock.

        Parameters
        ----------
        samples : Dataset
            The input dataset to rename columns in.
        **kwargs : Any
            Forwarded unchanged to the wrapped block's ``generate``.

        Returns
        -------
        Dataset
            Whatever the wrapped block returns for ``samples``.
        """
        delegate = self._new_block
        return delegate.generate(samples, **kwargs)
@@ -1,58 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """DEPRECATED: SamplePopulatorBlock for backward compatibility.
3
-
4
- This module provides a deprecated stub for SamplePopulatorBlock.
5
- This block is deprecated and will be replaced with a router block.
6
- """
7
-
8
- # Standard
9
- from typing import Any
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..registry import BlockRegistry
19
-
20
- logger = setup_logger(__name__)
21
-
22
-
23
@BlockRegistry.register(
    "SamplePopulatorBlock",
    "deprecated",
    "DEPRECATED: Use a router block instead. Populates dataset with data from configuration files",
)
class SamplePopulatorBlock(BaseBlock):
    """DEPRECATED: non-functional stub for the old SamplePopulatorBlock.

    .. deprecated::
        This block is deprecated and will be replaced with a router block.

    The class can still be constructed (with a ``DeprecationWarning``), but
    calling ``generate`` always raises ``NotImplementedError``.
    """

    def __init__(
        self,
        block_name: str,
        config_paths: list[str],
        column_name: str,
        post_fix: str = "",
        **batch_kwargs: dict[str, Any],
    ) -> None:
        """Accept the legacy constructor signature and warn.

        Parameters
        ----------
        block_name : str
            Name of the block.
        config_paths : list[str]
            Accepted for signature compatibility; not used by this stub.
        column_name : str
            Used as both the single input and the single output column.
        post_fix : str, optional
            Accepted for signature compatibility; not used by this stub.
        **batch_kwargs : dict[str, Any]
            Accepted for signature compatibility; not used by this stub.
        """
        notice = (
            "SamplePopulatorBlock is deprecated and will be replaced with a router block."
        )
        # stacklevel=2 reports the warning at the constructing code.
        warnings.warn(notice, category=DeprecationWarning, stacklevel=2)

        # Placeholder columns: the same name is supplied for input and output
        # (as two separate lists) when initialising the base class.
        super().__init__(
            block_name=block_name,
            input_cols=[column_name],
            output_cols=[column_name],
        )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Always raise: the deprecated stub has no implementation.

        Raises
        ------
        NotImplementedError
            Unconditionally, regardless of ``samples`` or ``kwargs``.
        """
        failure = (
            "SamplePopulatorBlock is deprecated and will be replaced with a router block."
        )
        raise NotImplementedError(failure)
@@ -1,97 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """DEPRECATED: SelectorBlock for backward compatibility.
3
-
4
- This module provides a deprecated wrapper for the old SelectorBlock interface.
5
- Use transform.IndexBasedMapperBlock instead.
6
- """
7
-
8
- # Standard
9
- from typing import Any
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..registry import BlockRegistry
19
- from ..transform.index_based_mapper import IndexBasedMapperBlock
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
@BlockRegistry.register(
    "SelectorBlock",
    "deprecated",
    "DEPRECATED: Use IndexBasedMapperBlock instead. Selects and maps values from one column to another",
)
class SelectorBlock(BaseBlock):
    """DEPRECATED: Block for selecting and mapping values from one column to another.

    .. deprecated::
        Use ``IndexBasedMapperBlock`` instead.
        This class will be removed in a future version.

    This wrapper keeps the old constructor signature and delegates the work
    to an internal ``IndexBasedMapperBlock``: a mapping dictionary is used to
    select values from one column and store them in an output column based on
    a choice column's value.

    Parameters
    ----------
    block_name : str
        Name of the block.
    choice_map : dict[str, str]
        Dictionary mapping choice values to column names.
    choice_col : str
        Name of the column containing choice values.
    output_col : str
        Name of the column to store selected values.
    **batch_kwargs : dict[str, Any]
        Accepted for backwards compatibility; not used by this wrapper.
    """

    def __init__(
        self,
        block_name: str,
        choice_map: dict[str, str],
        choice_col: str,
        output_col: str,
        **batch_kwargs: dict[str, Any],
    ) -> None:
        # stacklevel=2 reports the warning at the constructing code.
        warnings.warn(
            "SelectorBlock is deprecated. Use sdg_hub.blocks.transform.IndexBasedMapperBlock instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        # Every column referenced by choice_map is an input, plus the choice
        # column itself (appended last).
        all_input_cols = list(choice_map.values()) + [choice_col]

        super().__init__(
            block_name=block_name, input_cols=all_input_cols, output_cols=[output_col]
        )

        # Replacement block that performs the real work in ``generate``; the
        # single legacy choice/output column is wrapped into a list.
        self._impl = IndexBasedMapperBlock(
            block_name=block_name,
            input_cols=all_input_cols,
            output_cols=[output_col],
            choice_map=choice_map,
            choice_cols=[choice_col],
        )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Generate a new dataset with selected values.

        Parameters
        ----------
        samples : Dataset
            Input dataset to process.
        **kwargs : Any
            Forwarded unchanged to the wrapped block's ``generate``, matching
            the other deprecated wrappers (previously these were dropped).

        Returns
        -------
        Dataset
            Whatever the wrapped ``IndexBasedMapperBlock`` returns for
            ``samples``.
        """
        # NOTE(review): assumes IndexBasedMapperBlock.generate accepts
        # **kwargs like the other block implementations - confirm.
        return self._impl.generate(samples, **kwargs)
@@ -1,88 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Deprecated SetToMajorityValue for backwards compatibility.
3
-
4
- This module provides a deprecated wrapper around UniformColumnValueSetter
5
- to maintain backwards compatibility with existing code and configurations.
6
- """
7
-
8
- # Standard
9
- from typing import Any
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..registry import BlockRegistry
19
- from ..transform import UniformColumnValueSetter
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
@BlockRegistry.register(
    "SetToMajorityValue",
    "deprecated",
    "DEPRECATED: Use UniformColumnValueSetter with reduction_strategy='mode' instead. Sets all values in a column to the most frequent value",
)
class SetToMajorityValue(BaseBlock):
    """DEPRECATED: legacy entry point for setting a column to its majority value.

    Kept only so that existing code and configurations that refer to
    ``SetToMajorityValue`` keep working. Constructing it emits a
    ``DeprecationWarning``; the work is delegated to an internal
    ``UniformColumnValueSetter`` created with ``reduction_strategy="mode"``.
    Use that block directly in new code.
    """

    def __init__(
        self,
        block_name: str,
        col_name: str,
    ) -> None:
        """Warn about the deprecation and set up the delegate block.

        Parameters
        ----------
        block_name : str
            Name of the block.
        col_name : str
            Name of the column to set to majority value.
        """
        deprecation_notice = (
            "SetToMajorityValue is deprecated and will be removed in a future version. "
            "Please use UniformColumnValueSetter with reduction_strategy='mode' instead."
        )
        # stacklevel=2 reports the warning at the code that constructs the
        # block rather than at this line.
        warnings.warn(deprecation_notice, category=DeprecationWarning, stacklevel=2)

        # The legacy single ``col_name`` becomes a one-element ``input_cols``
        # list; this wrapper declares no output columns.
        super().__init__(
            block_name=block_name,
            input_cols=[col_name],
            output_cols=[],
        )

        # Replacement block that performs the real work in ``generate``.
        self._new_block = UniformColumnValueSetter(
            block_name=block_name,
            input_cols=[col_name],
            reduction_strategy="mode",
        )

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        """Process the dataset by delegating to the wrapped UniformColumnValueSetter.

        Parameters
        ----------
        samples : Dataset
            The input dataset to process.
        **kwargs : Any
            Forwarded unchanged to the wrapped block's ``generate``.

        Returns
        -------
        Dataset
            Whatever the wrapped block returns for ``samples``.
        """
        delegate = self._new_block
        return delegate.generate(samples, **kwargs)
@@ -1,198 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Migration utilities for backward compatibility with old flow formats."""
3
-
4
- # Standard
5
- from pathlib import Path
6
- from typing import Any, Union
7
-
8
- # Local
9
- from ..utils.logger_config import setup_logger
10
-
11
- logger = setup_logger(__name__)
12
-
13
-
14
class FlowMigration:
    """Utility class for migrating old flow formats to new format.

    The old format is a bare list of block configurations (or a dict whose
    values are block configurations); the new format is a dictionary with
    ``metadata`` and ``blocks`` keys. All methods are static.
    """

    @staticmethod
    def is_old_format(flow_config: Union[list[dict[str, Any]], dict[str, Any]]) -> bool:
        """Detect if a flow configuration is in the old format.

        Parameters
        ----------
        flow_config : Union[list[dict[str, Any]], dict[str, Any]]
            The loaded YAML configuration.

        Returns
        -------
        bool
            True if the configuration is in old format, False otherwise.
            Anything that cannot be classified is treated as new format.
        """
        # Old format: direct array of blocks.
        if isinstance(flow_config, list):
            return True

        if not isinstance(flow_config, dict):
            return False

        # A dict carrying either new-format key is never treated as old format.
        if "metadata" in flow_config or "blocks" in flow_config:
            return False

        # A dict with neither key is old format only if at least one value
        # looks like a block config (a dict carrying "block_type").
        return any(
            isinstance(value, dict) and "block_type" in value
            for value in flow_config.values()
        )

    @staticmethod
    def migrate_to_new_format(
        flow_config: list[dict[str, Any]], yaml_path: str
    ) -> tuple[dict[str, Any], dict[str, dict[str, Any]]]:
        """Migrate old format flow configuration to new format.

        Parameters
        ----------
        flow_config : list[dict[str, Any]]
            Old format flow configuration (array of blocks).
        yaml_path : str
            Path to the original YAML file; its stem becomes the flow name
            used for the generated metadata.

        Returns
        -------
        tuple[dict[str, Any], dict[str, dict[str, Any]]]
            Tuple of (new format flow configuration, extracted runtime_params
            keyed by block name).
        """
        logger.info(f"Migrating old flow format from: {yaml_path}")

        # Generate default metadata
        flow_name = Path(yaml_path).stem
        metadata = FlowMigration._generate_default_metadata(flow_name)

        # Process blocks and extract runtime parameters
        migrated_blocks: list[Any] = []
        runtime_params: dict[str, dict[str, Any]] = {}

        for i, block_config in enumerate(flow_config):
            # Best-effort migration: a block that cannot be migrated is kept
            # unchanged. Only the migration call sits inside the try so that a
            # block can never be appended twice.
            try:
                migrated_block, block_runtime_params = (
                    FlowMigration._migrate_block_config(block_config)
                )
            except Exception as exc:
                logger.warning(f"Failed to migrate block at index {i}: {exc}")
                migrated_blocks.append(block_config)
                continue

            migrated_blocks.append(migrated_block)

            if not block_runtime_params:
                continue

            # Runtime params are keyed by the block's name; tolerate a missing
            # or malformed "block_config" section instead of raising.
            section = (
                migrated_block.get("block_config")
                if isinstance(migrated_block, dict)
                else None
            )
            block_name = section.get("block_name") if isinstance(section, dict) else None
            if block_name:
                runtime_params[block_name] = block_runtime_params
            else:
                logger.warning(
                    f"Dropping runtime_params for block at index {i}: no block_name found"
                )

        # Create new format structure
        new_config = {"metadata": metadata, "blocks": migrated_blocks}

        logger.info(f"Successfully migrated flow with {len(migrated_blocks)} blocks")
        logger.info(f"Extracted runtime_params for {len(runtime_params)} blocks")

        return new_config, runtime_params

    @staticmethod
    def _generate_default_metadata(flow_name: str) -> dict[str, Any]:
        """Generate default metadata for migrated flows.

        Parameters
        ----------
        flow_name : str
            Name recorded in the metadata and used to derive the flow id.

        Returns
        -------
        dict[str, Any]
            Metadata dictionary; includes an ``id`` entry only when
            ``get_flow_identifier`` returns a truthy value.
        """
        # Import here to avoid circular import
        from ..utils.flow_identifier import get_flow_identifier

        metadata = {
            "name": flow_name,
            "description": f"Migrated flow: {flow_name}",
            "version": "1.0.0",
            "author": "SDG_Hub",
            "tags": ["migrated"],
            "recommended_models": {
                "default": "meta-llama/Llama-3.3-70B-Instruct",
                "compatible": [],
                "experimental": [],
            },
        }

        # Generate id for migrated flows
        flow_id = get_flow_identifier(flow_name)
        if flow_id:
            metadata["id"] = flow_id
            logger.debug(f"Generated id for migrated flow: {flow_id}")

        return metadata

    @staticmethod
    def _migrate_block_config(
        block_config: dict[str, Any],
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """Migrate individual block configuration from old to new format.

        The input is never modified: top-level changes are made on a copy and
        the nested ``block_config`` section is copied before it is rewritten.

        Parameters
        ----------
        block_config : dict[str, Any]
            Old format block configuration. Non-dict values are returned
            unchanged with empty runtime params.

        Returns
        -------
        tuple[dict[str, Any], dict[str, Any]]
            Tuple of (migrated block configuration, extracted runtime_params).
        """
        if not isinstance(block_config, dict):
            return block_config, {}

        # Shallow copy: top-level keys are popped from the copy only.
        migrated_config = block_config.copy()
        runtime_params: dict[str, Any] = {}

        # Extract gen_kwargs as runtime_params (a null entry becomes {}).
        if "gen_kwargs" in migrated_config:
            runtime_params = migrated_config.pop("gen_kwargs") or {}
            logger.debug(f"Extracted gen_kwargs as runtime_params: {runtime_params}")

        # Remove unsupported fields
        for unsupported_field in ("drop_columns", "drop_duplicates", "batch_kwargs"):
            if unsupported_field in migrated_config:
                del migrated_config[unsupported_field]
                logger.debug(
                    f"Ignoring {unsupported_field} as it's not supported in new flow format"
                )

        block_type = migrated_config.get("block_type")
        block_config_section = migrated_config.get("block_config", {})
        if not isinstance(block_config_section, dict):
            # Nothing block-type specific can be done with a malformed section.
            return migrated_config, runtime_params

        # parser_kwargs for LLMBlock stay in block_config; this only logs them.
        if block_type == "LLMBlock" and "parser_kwargs" in block_config_section:
            parser_kwargs = block_config_section["parser_kwargs"]
            logger.debug(f"Preserving parser_kwargs for LLMBlock: {parser_kwargs}")

        # Handle operator string conversion for FilterByValueBlock
        if block_type == "FilterByValueBlock":
            operation = block_config_section.get("operation")
            if isinstance(operation, str) and operation.startswith("operator."):
                # Convert "operator.eq" to "eq". The section is rebuilt rather
                # than edited in place so the caller's nested dict is untouched.
                converted = operation.removeprefix("operator.")
                migrated_config["block_config"] = {
                    **block_config_section,
                    "operation": converted,
                }
                logger.debug(f"Converted operation from {operation} to {converted}")

        return migrated_config, runtime_params