sdg-hub 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/__init__.py +0 -22
  3. sdg_hub/core/blocks/transform/rename_columns.py +19 -0
  4. sdg_hub/core/flow/base.py +146 -81
  5. sdg_hub/core/utils/__init__.py +11 -3
  6. sdg_hub/core/utils/flow_metrics.py +116 -0
  7. sdg_hub/core/utils/time_estimator.py +344 -0
  8. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
  9. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
  10. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
  11. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
  12. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +16 -10
  13. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/METADATA +2 -2
  14. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/RECORD +17 -27
  15. sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
  16. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
  17. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
  18. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
  19. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
  20. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
  21. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
  22. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
  23. sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
  24. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
  25. sdg_hub/core/flow/migration.py +0 -198
  26. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/WHEEL +0 -0
  27. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/licenses/LICENSE +0 -0
  28. {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/top_level.txt +0 -0
@@ -1,93 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """DEPRECATED: CombineColumnsBlock for backward compatibility.
3
-
4
- This module provides a deprecated wrapper for the old CombineColumnsBlock interface.
5
- Use transform.CombineColumnsBlock instead.
6
- """
7
-
8
- # Standard
9
- from typing import Any
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..registry import BlockRegistry
19
- from ..transform.text_concat import TextConcatBlock
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
- @BlockRegistry.register(
25
- "CombineColumnsBlock",
26
- "deprecated",
27
- "DEPRECATED: Use TextConcatBlock instead. Combines multiple columns into a single column using a separator",
28
- )
29
- class CombineColumnsBlock(BaseBlock):
30
- r"""DEPRECATED: Combine multiple columns into a single column using a separator.
31
-
32
- .. deprecated::
33
- Use `sdg_hub.blocks.transform.CombineColumnsBlock` instead.
34
- This class will be removed in a future version.
35
-
36
- This block concatenates values from multiple columns into a single output column,
37
- using a specified separator between values.
38
-
39
- Parameters
40
- ----------
41
- block_name : str
42
- Name of the block.
43
- columns : List[str]
44
- List of column names to combine.
45
- output_col : str
46
- Name of the column to store combined values.
47
- separator : str, optional
48
- String to use as separator between combined values, by default "\\n\\n".
49
- **batch_kwargs : Dict[str, Any]
50
- Additional keyword arguments for batch processing.
51
- """
52
-
53
- def __init__(
54
- self,
55
- block_name: str,
56
- columns: list[str],
57
- output_col: str,
58
- separator: str = "\n\n",
59
- **batch_kwargs: dict[str, Any],
60
- ) -> None:
61
- warnings.warn(
62
- "CombineColumnsBlock is deprecated. Use sdg_hub.blocks.transform.TextConcatBlock instead.",
63
- DeprecationWarning,
64
- stacklevel=2,
65
- )
66
-
67
- # Initialize with dummy values for BaseBlock validation
68
- super().__init__(
69
- block_name=block_name, input_cols=columns, output_cols=[output_col]
70
- )
71
-
72
- # Create the new implementation
73
- self._impl = TextConcatBlock(
74
- block_name=block_name,
75
- input_cols=columns,
76
- output_cols=[output_col],
77
- separator=separator,
78
- )
79
-
80
- def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
81
- """Generate a dataset with combined columns.
82
-
83
- Parameters
84
- ----------
85
- samples : Dataset
86
- Input dataset to process.
87
-
88
- Returns
89
- -------
90
- Dataset
91
- Dataset with combined values stored in output column.
92
- """
93
- return self._impl.generate(samples)
@@ -1,88 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Deprecated DuplicateColumns for backwards compatibility.
3
-
4
- This module provides a deprecated wrapper around DuplicateColumnsBlock
5
- to maintain backwards compatibility with existing code and configurations.
6
- """
7
-
8
- # Standard
9
- from typing import Any
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..registry import BlockRegistry
19
- from ..transform import DuplicateColumnsBlock
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
- @BlockRegistry.register(
25
- "DuplicateColumns",
26
- "deprecated",
27
- "DEPRECATED: Use DuplicateColumnsBlock instead. Duplicates existing columns with new names according to a mapping dictionary",
28
- )
29
- class DuplicateColumns(BaseBlock):
30
- """DEPRECATED: Block for duplicating existing columns with new names.
31
-
32
- This block is deprecated and maintained only for backwards compatibility.
33
- Please use DuplicateColumnsBlock instead.
34
-
35
- This block creates copies of existing columns with new names as specified
36
- in the columns mapping dictionary.
37
- """
38
-
39
- def __init__(
40
- self,
41
- block_name: str,
42
- columns_map: dict[str, str],
43
- ) -> None:
44
- """Initialize the deprecated DuplicateColumns.
45
-
46
- Parameters
47
- ----------
48
- block_name : str
49
- Name of the block.
50
- columns_map : Dict[str, str]
51
- Dictionary mapping existing column names to new column names.
52
- Keys are existing column names, values are new column names.
53
- """
54
- # Issue deprecation warning
55
- warnings.warn(
56
- "DuplicateColumns is deprecated and will be removed in a future version. "
57
- "Please use DuplicateColumnsBlock instead.",
58
- DeprecationWarning,
59
- stacklevel=2,
60
- )
61
-
62
- # Map old signature to new signature
63
- super().__init__(
64
- block_name=block_name,
65
- input_cols=columns_map,
66
- output_cols=list(columns_map.values()),
67
- )
68
-
69
- # Create the new block instance with mapped parameters
70
- self._new_block = DuplicateColumnsBlock(
71
- block_name=block_name,
72
- input_cols=columns_map,
73
- )
74
-
75
- def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
76
- """Generate dataset with duplicated columns using the new DuplicateColumnsBlock.
77
-
78
- Parameters
79
- ----------
80
- samples : Dataset
81
- The input dataset to duplicate columns from.
82
-
83
- Returns
84
- -------
85
- Dataset
86
- The dataset with additional duplicated columns.
87
- """
88
- return self._new_block.generate(samples, **kwargs)
@@ -1,103 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Deprecated FilterByValueBlock for backwards compatibility.
3
-
4
- This module provides a deprecated wrapper around ColumnValueFilterBlock
5
- to maintain backwards compatibility with existing code and configurations.
6
- """
7
-
8
- # Standard
9
- from typing import Any, Callable, Optional, Union
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..filtering import ColumnValueFilterBlock
19
- from ..registry import BlockRegistry
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
- @BlockRegistry.register(
25
- "FilterByValueBlock",
26
- "deprecated",
27
- "DEPRECATED: Use ColumnValueFilterBlock instead. Filters datasets based on column values using various comparison operations",
28
- )
29
- class FilterByValueBlock(BaseBlock):
30
- """DEPRECATED: A block for filtering datasets based on column values.
31
-
32
- This block is deprecated and maintained only for backwards compatibility.
33
- Please use ColumnValueFilterBlock instead.
34
-
35
- This block allows filtering of datasets using various operations (e.g., equals, contains)
36
- on specified column values, with optional data type conversion.
37
- """
38
-
39
- def __init__(
40
- self,
41
- block_name: str,
42
- filter_column: str,
43
- filter_value: Union[Any, list[Any]],
44
- operation: Callable[[Any, Any], bool],
45
- convert_dtype: Optional[Union[type[float], type[int]]] = None,
46
- **batch_kwargs: dict[str, Any],
47
- ) -> None:
48
- """Initialize the deprecated FilterByValueBlock.
49
-
50
- Parameters
51
- ----------
52
- block_name : str
53
- Name of the block.
54
- filter_column : str
55
- Column name to filter on.
56
- filter_value : Union[Any, list[Any]]
57
- The value(s) to filter by.
58
- operation : Callable[[Any, Any], bool]
59
- A binary operator from the operator module.
60
- convert_dtype : Optional[Union[type[float], type[int]]], optional
61
- Type to convert the filter column to.
62
- **batch_kwargs : dict[str, Any]
63
- Additional batch processing arguments.
64
- """
65
- # Issue deprecation warning
66
- warnings.warn(
67
- "FilterByValueBlock is deprecated and will be removed in a future version. "
68
- "Please use ColumnValueFilterBlock instead.",
69
- DeprecationWarning,
70
- stacklevel=2,
71
- )
72
-
73
- # Map old signature to new signature
74
- super().__init__(
75
- block_name=block_name,
76
- input_cols=[filter_column],
77
- output_cols=[],
78
- )
79
-
80
- # Create the new block instance with mapped parameters
81
- self._new_block = ColumnValueFilterBlock(
82
- block_name=block_name,
83
- input_cols=[filter_column],
84
- output_cols=[],
85
- filter_value=filter_value,
86
- operation=operation,
87
- convert_dtype=convert_dtype,
88
- )
89
-
90
- def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
91
- """Generate filtered dataset using the new ColumnValueFilterBlock.
92
-
93
- Parameters
94
- ----------
95
- samples : Dataset
96
- The input dataset to filter.
97
-
98
- Returns
99
- -------
100
- Dataset
101
- The filtered dataset.
102
- """
103
- return self._new_block.generate(samples, **kwargs)
@@ -1,94 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Deprecated FlattenColumnsBlock for backwards compatibility.
3
-
4
- This module provides a deprecated wrapper around MeltColumnsBlock
5
- to maintain backwards compatibility with existing code and configurations.
6
- """
7
-
8
- # Standard
9
- from typing import Any
10
- import warnings
11
-
12
- # Third Party
13
- from datasets import Dataset
14
-
15
- # Local
16
- from ...utils.logger_config import setup_logger
17
- from ..base import BaseBlock
18
- from ..registry import BlockRegistry
19
- from ..transform import MeltColumnsBlock
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
- @BlockRegistry.register(
25
- "FlattenColumnsBlock",
26
- "deprecated",
27
- "DEPRECATED: Use MeltColumnsBlock instead. Transforms wide dataset format into long format by melting columns into rows",
28
- )
29
- class FlattenColumnsBlock(BaseBlock):
30
- """DEPRECATED: Block for flattening multiple columns into a long format.
31
-
32
- This block is deprecated and maintained only for backwards compatibility.
33
- Please use MeltColumnsBlock instead.
34
-
35
- This block transforms a wide dataset format into a long format by melting
36
- specified columns into rows, creating new variable and value columns.
37
- """
38
-
39
- def __init__(
40
- self,
41
- block_name: str,
42
- var_cols: list[str],
43
- value_name: str,
44
- var_name: str,
45
- ) -> None:
46
- """Initialize the deprecated FlattenColumnsBlock.
47
-
48
- Parameters
49
- ----------
50
- block_name : str
51
- Name of the block.
52
- var_cols : List[str]
53
- List of column names to be melted into rows.
54
- value_name : str
55
- Name of the new column that will contain the values.
56
- var_name : str
57
- Name of the new column that will contain the variable names.
58
- """
59
- # Issue deprecation warning
60
- warnings.warn(
61
- "FlattenColumnsBlock is deprecated and will be removed in a future version. "
62
- "Please use MeltColumnsBlock instead.",
63
- DeprecationWarning,
64
- stacklevel=2,
65
- )
66
-
67
- # Map old signature to new signature
68
- super().__init__(
69
- block_name=block_name,
70
- input_cols=var_cols,
71
- output_cols=[value_name, var_name],
72
- )
73
-
74
- # Create the new block instance with mapped parameters
75
- self._new_block = MeltColumnsBlock(
76
- block_name=block_name,
77
- input_cols=var_cols,
78
- output_cols=[value_name, var_name],
79
- )
80
-
81
- def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
82
- """Generate flattened dataset using the new MeltColumnsBlock.
83
-
84
- Parameters
85
- ----------
86
- samples : Dataset
87
- The input dataset to flatten.
88
-
89
- Returns
90
- -------
91
- Dataset
92
- The flattened dataset in long format.
93
- """
94
- return self._new_block.generate(samples, **kwargs)