sdg-hub 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/__init__.py +0 -22
- sdg_hub/core/blocks/transform/rename_columns.py +19 -0
- sdg_hub/core/flow/base.py +146 -81
- sdg_hub/core/utils/__init__.py +11 -3
- sdg_hub/core/utils/flow_metrics.py +116 -0
- sdg_hub/core/utils/time_estimator.py +344 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +5 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +5 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +5 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +6 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +16 -10
- {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/METADATA +2 -2
- {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/RECORD +17 -27
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +0 -29
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +0 -93
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +0 -88
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +0 -103
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +0 -94
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +0 -479
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +0 -88
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +0 -58
- sdg_hub/core/blocks/deprecated_blocks/selector.py +0 -97
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +0 -88
- sdg_hub/core/flow/migration.py +0 -198
- {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.4.1.dist-info → sdg_hub-0.5.0.dist-info}/top_level.txt +0 -0
@@ -1,93 +0,0 @@
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""DEPRECATED: CombineColumnsBlock for backward compatibility.
|
3
|
-
|
4
|
-
This module provides a deprecated wrapper for the old CombineColumnsBlock interface.
|
5
|
-
Use transform.TextConcatBlock instead.
|
6
|
-
"""
|
7
|
-
|
8
|
-
# Standard
|
9
|
-
from typing import Any
|
10
|
-
import warnings
|
11
|
-
|
12
|
-
# Third Party
|
13
|
-
from datasets import Dataset
|
14
|
-
|
15
|
-
# Local
|
16
|
-
from ...utils.logger_config import setup_logger
|
17
|
-
from ..base import BaseBlock
|
18
|
-
from ..registry import BlockRegistry
|
19
|
-
from ..transform.text_concat import TextConcatBlock
|
20
|
-
|
21
|
-
logger = setup_logger(__name__)
|
22
|
-
|
23
|
-
|
24
|
-
@BlockRegistry.register(
|
25
|
-
"CombineColumnsBlock",
|
26
|
-
"deprecated",
|
27
|
-
"DEPRECATED: Use TextConcatBlock instead. Combines multiple columns into a single column using a separator",
|
28
|
-
)
|
29
|
-
class CombineColumnsBlock(BaseBlock):
|
30
|
-
r"""DEPRECATED: Combine multiple columns into a single column using a separator.
|
31
|
-
|
32
|
-
.. deprecated::
|
33
|
-
Use `sdg_hub.blocks.transform.TextConcatBlock` instead.
|
34
|
-
This class will be removed in a future version.
|
35
|
-
|
36
|
-
This block concatenates values from multiple columns into a single output column,
|
37
|
-
using a specified separator between values.
|
38
|
-
|
39
|
-
Parameters
|
40
|
-
----------
|
41
|
-
block_name : str
|
42
|
-
Name of the block.
|
43
|
-
columns : List[str]
|
44
|
-
List of column names to combine.
|
45
|
-
output_col : str
|
46
|
-
Name of the column to store combined values.
|
47
|
-
separator : str, optional
|
48
|
-
String to use as separator between combined values, by default "\n\n".
|
49
|
-
**batch_kwargs : Dict[str, Any]
|
50
|
-
Additional keyword arguments for batch processing.
|
51
|
-
"""
|
52
|
-
|
53
|
-
def __init__(
|
54
|
-
self,
|
55
|
-
block_name: str,
|
56
|
-
columns: list[str],
|
57
|
-
output_col: str,
|
58
|
-
separator: str = "\n\n",
|
59
|
-
**batch_kwargs: dict[str, Any],
|
60
|
-
) -> None:
|
61
|
-
warnings.warn(
|
62
|
-
"CombineColumnsBlock is deprecated. Use sdg_hub.blocks.transform.TextConcatBlock instead.",
|
63
|
-
DeprecationWarning,
|
64
|
-
stacklevel=2,
|
65
|
-
)
|
66
|
-
|
67
|
-
# Register the legacy column arguments with BaseBlock as its input/output columns
|
68
|
-
super().__init__(
|
69
|
-
block_name=block_name, input_cols=columns, output_cols=[output_col]
|
70
|
-
)
|
71
|
-
|
72
|
-
# Create the new implementation
|
73
|
-
self._impl = TextConcatBlock(
|
74
|
-
block_name=block_name,
|
75
|
-
input_cols=columns,
|
76
|
-
output_cols=[output_col],
|
77
|
-
separator=separator,
|
78
|
-
)
|
79
|
-
|
80
|
-
def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
|
81
|
-
"""Generate a dataset with combined columns.
|
82
|
-
|
83
|
-
Parameters
|
84
|
-
----------
|
85
|
-
samples : Dataset
|
86
|
-
Input dataset to process.
|
87
|
-
|
88
|
-
Returns
|
89
|
-
-------
|
90
|
-
Dataset
|
91
|
-
Dataset with combined values stored in output column.
|
92
|
-
"""
|
93
|
-
return self._impl.generate(samples)
|
@@ -1,88 +0,0 @@
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""Deprecated DuplicateColumns for backwards compatibility.
|
3
|
-
|
4
|
-
This module provides a deprecated wrapper around DuplicateColumnsBlock
|
5
|
-
to maintain backwards compatibility with existing code and configurations.
|
6
|
-
"""
|
7
|
-
|
8
|
-
# Standard
|
9
|
-
from typing import Any
|
10
|
-
import warnings
|
11
|
-
|
12
|
-
# Third Party
|
13
|
-
from datasets import Dataset
|
14
|
-
|
15
|
-
# Local
|
16
|
-
from ...utils.logger_config import setup_logger
|
17
|
-
from ..base import BaseBlock
|
18
|
-
from ..registry import BlockRegistry
|
19
|
-
from ..transform import DuplicateColumnsBlock
|
20
|
-
|
21
|
-
logger = setup_logger(__name__)
|
22
|
-
|
23
|
-
|
24
|
-
@BlockRegistry.register(
|
25
|
-
"DuplicateColumns",
|
26
|
-
"deprecated",
|
27
|
-
"DEPRECATED: Use DuplicateColumnsBlock instead. Duplicates existing columns with new names according to a mapping dictionary",
|
28
|
-
)
|
29
|
-
class DuplicateColumns(BaseBlock):
|
30
|
-
"""DEPRECATED: Block for duplicating existing columns with new names.
|
31
|
-
|
32
|
-
This block is deprecated and maintained only for backwards compatibility.
|
33
|
-
Please use DuplicateColumnsBlock instead.
|
34
|
-
|
35
|
-
This block creates copies of existing columns with new names as specified
|
36
|
-
in the columns mapping dictionary.
|
37
|
-
"""
|
38
|
-
|
39
|
-
def __init__(
|
40
|
-
self,
|
41
|
-
block_name: str,
|
42
|
-
columns_map: dict[str, str],
|
43
|
-
) -> None:
|
44
|
-
"""Initialize the deprecated DuplicateColumns.
|
45
|
-
|
46
|
-
Parameters
|
47
|
-
----------
|
48
|
-
block_name : str
|
49
|
-
Name of the block.
|
50
|
-
columns_map : Dict[str, str]
|
51
|
-
Dictionary mapping existing column names to new column names.
|
52
|
-
Keys are existing column names, values are new column names.
|
53
|
-
"""
|
54
|
-
# Issue deprecation warning
|
55
|
-
warnings.warn(
|
56
|
-
"DuplicateColumns is deprecated and will be removed in a future version. "
|
57
|
-
"Please use DuplicateColumnsBlock instead.",
|
58
|
-
DeprecationWarning,
|
59
|
-
stacklevel=2,
|
60
|
-
)
|
61
|
-
|
62
|
-
# Map old signature to new signature
|
63
|
-
super().__init__(
|
64
|
-
block_name=block_name,
|
65
|
-
input_cols=columns_map,
|
66
|
-
output_cols=list(columns_map.values()),
|
67
|
-
)
|
68
|
-
|
69
|
-
# Create the new block instance with mapped parameters
|
70
|
-
self._new_block = DuplicateColumnsBlock(
|
71
|
-
block_name=block_name,
|
72
|
-
input_cols=columns_map,
|
73
|
-
)
|
74
|
-
|
75
|
-
def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
|
76
|
-
"""Generate dataset with duplicated columns using the new DuplicateColumnsBlock.
|
77
|
-
|
78
|
-
Parameters
|
79
|
-
----------
|
80
|
-
samples : Dataset
|
81
|
-
The input dataset to duplicate columns from.
|
82
|
-
|
83
|
-
Returns
|
84
|
-
-------
|
85
|
-
Dataset
|
86
|
-
The dataset with additional duplicated columns.
|
87
|
-
"""
|
88
|
-
return self._new_block.generate(samples, **kwargs)
|
@@ -1,103 +0,0 @@
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""Deprecated FilterByValueBlock for backwards compatibility.
|
3
|
-
|
4
|
-
This module provides a deprecated wrapper around ColumnValueFilterBlock
|
5
|
-
to maintain backwards compatibility with existing code and configurations.
|
6
|
-
"""
|
7
|
-
|
8
|
-
# Standard
|
9
|
-
from typing import Any, Callable, Optional, Union
|
10
|
-
import warnings
|
11
|
-
|
12
|
-
# Third Party
|
13
|
-
from datasets import Dataset
|
14
|
-
|
15
|
-
# Local
|
16
|
-
from ...utils.logger_config import setup_logger
|
17
|
-
from ..base import BaseBlock
|
18
|
-
from ..filtering import ColumnValueFilterBlock
|
19
|
-
from ..registry import BlockRegistry
|
20
|
-
|
21
|
-
logger = setup_logger(__name__)
|
22
|
-
|
23
|
-
|
24
|
-
@BlockRegistry.register(
|
25
|
-
"FilterByValueBlock",
|
26
|
-
"deprecated",
|
27
|
-
"DEPRECATED: Use ColumnValueFilterBlock instead. Filters datasets based on column values using various comparison operations",
|
28
|
-
)
|
29
|
-
class FilterByValueBlock(BaseBlock):
|
30
|
-
"""DEPRECATED: A block for filtering datasets based on column values.
|
31
|
-
|
32
|
-
This block is deprecated and maintained only for backwards compatibility.
|
33
|
-
Please use ColumnValueFilterBlock instead.
|
34
|
-
|
35
|
-
This block allows filtering of datasets using various operations (e.g., equals, contains)
|
36
|
-
on specified column values, with optional data type conversion.
|
37
|
-
"""
|
38
|
-
|
39
|
-
def __init__(
|
40
|
-
self,
|
41
|
-
block_name: str,
|
42
|
-
filter_column: str,
|
43
|
-
filter_value: Union[Any, list[Any]],
|
44
|
-
operation: Callable[[Any, Any], bool],
|
45
|
-
convert_dtype: Optional[Union[type[float], type[int]]] = None,
|
46
|
-
**batch_kwargs: dict[str, Any],
|
47
|
-
) -> None:
|
48
|
-
"""Initialize the deprecated FilterByValueBlock.
|
49
|
-
|
50
|
-
Parameters
|
51
|
-
----------
|
52
|
-
block_name : str
|
53
|
-
Name of the block.
|
54
|
-
filter_column : str
|
55
|
-
Column name to filter on.
|
56
|
-
filter_value : Union[Any, list[Any]]
|
57
|
-
The value(s) to filter by.
|
58
|
-
operation : Callable[[Any, Any], bool]
|
59
|
-
A binary operator from the operator module.
|
60
|
-
convert_dtype : Optional[Union[type[float], type[int]]], optional
|
61
|
-
Type to convert the filter column to.
|
62
|
-
**batch_kwargs : dict[str, Any]
|
63
|
-
Additional batch processing arguments.
|
64
|
-
"""
|
65
|
-
# Issue deprecation warning
|
66
|
-
warnings.warn(
|
67
|
-
"FilterByValueBlock is deprecated and will be removed in a future version. "
|
68
|
-
"Please use ColumnValueFilterBlock instead.",
|
69
|
-
DeprecationWarning,
|
70
|
-
stacklevel=2,
|
71
|
-
)
|
72
|
-
|
73
|
-
# Map old signature to new signature
|
74
|
-
super().__init__(
|
75
|
-
block_name=block_name,
|
76
|
-
input_cols=[filter_column],
|
77
|
-
output_cols=[],
|
78
|
-
)
|
79
|
-
|
80
|
-
# Create the new block instance with mapped parameters
|
81
|
-
self._new_block = ColumnValueFilterBlock(
|
82
|
-
block_name=block_name,
|
83
|
-
input_cols=[filter_column],
|
84
|
-
output_cols=[],
|
85
|
-
filter_value=filter_value,
|
86
|
-
operation=operation,
|
87
|
-
convert_dtype=convert_dtype,
|
88
|
-
)
|
89
|
-
|
90
|
-
def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
|
91
|
-
"""Generate filtered dataset using the new ColumnValueFilterBlock.
|
92
|
-
|
93
|
-
Parameters
|
94
|
-
----------
|
95
|
-
samples : Dataset
|
96
|
-
The input dataset to filter.
|
97
|
-
|
98
|
-
Returns
|
99
|
-
-------
|
100
|
-
Dataset
|
101
|
-
The filtered dataset.
|
102
|
-
"""
|
103
|
-
return self._new_block.generate(samples, **kwargs)
|
@@ -1,94 +0,0 @@
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""Deprecated FlattenColumnsBlock for backwards compatibility.
|
3
|
-
|
4
|
-
This module provides a deprecated wrapper around MeltColumnsBlock
|
5
|
-
to maintain backwards compatibility with existing code and configurations.
|
6
|
-
"""
|
7
|
-
|
8
|
-
# Standard
|
9
|
-
from typing import Any
|
10
|
-
import warnings
|
11
|
-
|
12
|
-
# Third Party
|
13
|
-
from datasets import Dataset
|
14
|
-
|
15
|
-
# Local
|
16
|
-
from ...utils.logger_config import setup_logger
|
17
|
-
from ..base import BaseBlock
|
18
|
-
from ..registry import BlockRegistry
|
19
|
-
from ..transform import MeltColumnsBlock
|
20
|
-
|
21
|
-
logger = setup_logger(__name__)
|
22
|
-
|
23
|
-
|
24
|
-
@BlockRegistry.register(
|
25
|
-
"FlattenColumnsBlock",
|
26
|
-
"deprecated",
|
27
|
-
"DEPRECATED: Use MeltColumnsBlock instead. Transforms wide dataset format into long format by melting columns into rows",
|
28
|
-
)
|
29
|
-
class FlattenColumnsBlock(BaseBlock):
|
30
|
-
"""DEPRECATED: Block for flattening multiple columns into a long format.
|
31
|
-
|
32
|
-
This block is deprecated and maintained only for backwards compatibility.
|
33
|
-
Please use MeltColumnsBlock instead.
|
34
|
-
|
35
|
-
This block transforms a wide dataset format into a long format by melting
|
36
|
-
specified columns into rows, creating new variable and value columns.
|
37
|
-
"""
|
38
|
-
|
39
|
-
def __init__(
|
40
|
-
self,
|
41
|
-
block_name: str,
|
42
|
-
var_cols: list[str],
|
43
|
-
value_name: str,
|
44
|
-
var_name: str,
|
45
|
-
) -> None:
|
46
|
-
"""Initialize the deprecated FlattenColumnsBlock.
|
47
|
-
|
48
|
-
Parameters
|
49
|
-
----------
|
50
|
-
block_name : str
|
51
|
-
Name of the block.
|
52
|
-
var_cols : List[str]
|
53
|
-
List of column names to be melted into rows.
|
54
|
-
value_name : str
|
55
|
-
Name of the new column that will contain the values.
|
56
|
-
var_name : str
|
57
|
-
Name of the new column that will contain the variable names.
|
58
|
-
"""
|
59
|
-
# Issue deprecation warning
|
60
|
-
warnings.warn(
|
61
|
-
"FlattenColumnsBlock is deprecated and will be removed in a future version. "
|
62
|
-
"Please use MeltColumnsBlock instead.",
|
63
|
-
DeprecationWarning,
|
64
|
-
stacklevel=2,
|
65
|
-
)
|
66
|
-
|
67
|
-
# Map old signature to new signature
|
68
|
-
super().__init__(
|
69
|
-
block_name=block_name,
|
70
|
-
input_cols=var_cols,
|
71
|
-
output_cols=[value_name, var_name],
|
72
|
-
)
|
73
|
-
|
74
|
-
# Create the new block instance with mapped parameters
|
75
|
-
self._new_block = MeltColumnsBlock(
|
76
|
-
block_name=block_name,
|
77
|
-
input_cols=var_cols,
|
78
|
-
output_cols=[value_name, var_name],
|
79
|
-
)
|
80
|
-
|
81
|
-
def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
|
82
|
-
"""Generate flattened dataset using the new MeltColumnsBlock.
|
83
|
-
|
84
|
-
Parameters
|
85
|
-
----------
|
86
|
-
samples : Dataset
|
87
|
-
The input dataset to flatten.
|
88
|
-
|
89
|
-
Returns
|
90
|
-
-------
|
91
|
-
Dataset
|
92
|
-
The flattened dataset in long format.
|
93
|
-
"""
|
94
|
-
return self._new_block.generate(samples, **kwargs)
|